From 52ab4df94c7a279e115ceb11f93478fe8c90ba98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-Fran=C3=A7ois=20Reboud?=
Date: Sun, 12 May 2024 22:17:05 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=94=A8=20refactor:=20handle=20float16=20a?=
 =?UTF-8?q?long=20float=20on=20GPU=20(#120)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CHANGELOG.md                                  |    1 +
 Package.swift                                 |    2 +-
 .../GrAIdient/Core/Function/Activation.swift  |   20 +-
 Sources/GrAIdient/Core/Layer/LayerInput.swift |   60 +-
 .../Core/Layer/LayerNormalization.swift       |  210 +-
 .../GrAIdient/Core/Layer/LayerUpdate.swift    |  291 +-
 Sources/GrAIdient/Core/Model/Model.swift      |    2 +-
 .../Core/Optimizer/OptimizerAlgorithm.swift   |   46 +-
 .../Core/Optimizer/OptimizerImpl.swift        |   20 +-
 Sources/GrAIdient/Core/State/Weights.swift    |   16 +-
 Sources/GrAIdient/GrAI.swift                  |   76 +
 Sources/GrAIdient/Layer1D/Activation1D.swift  |    2 +-
 Sources/GrAIdient/Layer1D/BCE1D.swift         |    7 +-
 Sources/GrAIdient/Layer1D/BCESigmoid1D.swift  |    7 +-
 Sources/GrAIdient/Layer1D/Base/Layer1D.swift  |   18 +-
 .../GrAIdient/Layer1D/Base/LayerInput1D.swift |   12 +-
 .../Layer1D/Base/LayerOutput1D.swift          |   23 +-
 Sources/GrAIdient/Layer1D/Concat1D.swift      |    5 +-
 Sources/GrAIdient/Layer1D/Constant1D.swift    |   35 +-
 Sources/GrAIdient/Layer1D/DotProduct1D.swift  |    9 +-
 .../GrAIdient/Layer1D/FullyConnected.swift    |   70 +-
 Sources/GrAIdient/Layer1D/Input1D.swift       |    8 +-
 Sources/GrAIdient/Layer1D/LinearError1D.swift |    5 +-
 Sources/GrAIdient/Layer1D/MSE1D.swift         |    7 +-
 Sources/GrAIdient/Layer1D/Sum1D.swift         |    6 +-
 Sources/GrAIdient/Layer2D/Activation2D.swift  |    2 +-
 Sources/GrAIdient/Layer2D/AdaIN.swift         |    9 +-
 Sources/GrAIdient/Layer2D/BCE2D.swift         |    7 +-
 Sources/GrAIdient/Layer2D/BCESigmoid2D.swift  |    7 +-
 Sources/GrAIdient/Layer2D/BN2D.swift          |    5 +-
 Sources/GrAIdient/Layer2D/Base/Layer2D.swift  |   20 +-
 .../GrAIdient/Layer2D/Base/LayerInput2D.swift |   17 +-
 .../Layer2D/Base/LayerOutput2D.swift          |   28 +-
 Sources/GrAIdient/Layer2D/Concat2D.swift      |    5 +-
 Sources/GrAIdient/Layer2D/Constant2D.swift    |   35 +-
 Sources/GrAIdient/Layer2D/Convolution2D.swift |   63 +-
 .../GrAIdient/Layer2D/Deconvolution2D.swift   |    9 +-
 Sources/GrAIdient/Layer2D/Input2D.swift       |    6 +-
 .../GrAIdient/Layer2D/InstanceNorm2D.swift    |    5 +-
 Sources/GrAIdient/Layer2D/MSE2D.swift         |    7 +-
 Sources/GrAIdient/Layer2D/Multiply2D.swift    |   47 +-
 Sources/GrAIdient/Layer2D/Normalize2D.swift   |    8 +-
 .../Layer2D/SimilarityBatchError2D.swift      |   12 +-
 .../GrAIdient/Layer2D/SimilarityError2D.swift |   17 +-
 Sources/GrAIdient/Layer2D/Sum2D.swift         |    6 +-
 Sources/GrAIdient/Layer2D/VQ2D.swift          |   33 +-
 .../GrAIdient/LayerSeq/ActivationSeq.swift    |    2 +-
 .../GrAIdient/LayerSeq/Base/LayerSeq.swift    |   14 +-
 Sources/GrAIdient/LayerSeq/ConcatSeq.swift    |   10 +-
 Sources/GrAIdient/LayerSeq/ConstantSeq.swift  |   59 +-
 .../LayerSeq/FullyConnectedPatch.swift        |   61 +-
 .../LayerSeq/FullyConnectedSeq.swift          |   62 +-
 Sources/GrAIdient/LayerSeq/LayerNormSeq.swift |    3 +-
 Sources/GrAIdient/LayerSeq/QuerySeq.swift     |   11 +-
 Sources/GrAIdient/LayerSeq/SumSeq.swift       |    6 +-
 Sources/GrAIdient/LayerSeq/VQSeq.swift        |   37 +-
 Sources/GrAIdient/LayerSeq/ValueSeq.swift     |   22 +-
 ...Activation.metal => ActivationFloat.metal} |   24 +-
 .../Metal/Kernel/ActivationHalf.metal         |  403 ++
 .../{BatchNorm.metal => BatchNormFloat.metal} |   14 +-
 .../Metal/Kernel/BatchNormHalf.metal          |  415 ++
 .../{Biases.metal => BiasesFloat.metal}       |    2 +-
 .../GrAIdient/Metal/Kernel/BiasesHalf.metal   |   53 +
 ...nvolution.metal => ConvolutionFloat.metal} |   20 +-
 .../Metal/Kernel/ConvolutionHalf.metal        | 1049 +++++
 ...olution.metal => DeconvolutionFloat.metal} |    8 +-
 .../Metal/Kernel/DeconvolutionHalf.metal      |  419 ++
 ...nected.metal => FullyConnectedFloat.metal} |   14 +-
 .../Metal/Kernel/FullyConnectedHalf.metal     |  347 ++
 ...h.metal => FullyConnectedPatchFloat.metal} |   16 +-
 .../Kernel/FullyConnectedPatchHalf.metal      |  529 +++
 ...Seq.metal => FullyConnectedSeqFloat.metal} |   20 +-
 .../Metal/Kernel/FullyConnectedSeqHalf.metal  |  609 +++
 ...anceNorm.metal => InstanceNormFloat.metal} |   16 +-
 .../Metal/Kernel/InstanceNormHalf.metal       |  467 +++
 .../{Layer1D.metal => Layer1DFloat.metal}     |   38 +-
 .../GrAIdient/Metal/Kernel/Layer1DHalf.metal  |  915 +++++
 .../{Layer2D.metal => Layer2DFloat.metal}     |  110 +-
 .../GrAIdient/Metal/Kernel/Layer2DHalf.metal  | 3570 +++++++++++++++++
 ...LayerMerge.metal => LayerMergeFloat.metal} |   12 +-
 .../Metal/Kernel/LayerMergeHalf.metal         |  161 +
 .../{LayerNorm.metal => LayerNormFloat.metal} |   24 +-
 .../Metal/Kernel/LayerNormHalf.metal          |  583 +++
 .../{LayerSeq.metal => LayerSeqFloat.metal}   |   90 +-
 .../GrAIdient/Metal/Kernel/LayerSeqHalf.metal | 2745 +++++++++++++
 .../{Optimizer.metal => OptimizerFloat.metal} |   18 +-
 .../Metal/Kernel/OptimizerHalf.metal          |  438 ++
 .../{Reduce.metal => ReduceFloat.metal}       |    8 +-
 .../GrAIdient/Metal/Kernel/ReduceHalf.metal   |  184 +
 .../Kernel/{Reset.metal => ResetFloat.metal}  |    2 +-
 .../GrAIdient/Metal/Kernel/ResetHalf.metal    |   77 +
 .../Kernel/{VQ2D.metal => VQ2DFloat.metal}    |   16 +-
 Sources/GrAIdient/Metal/Kernel/VQ2DHalf.metal |  544 +++
 .../Kernel/{VQSeq.metal => VQSeqFloat.metal}  |   14 +-
 .../GrAIdient/Metal/Kernel/VQSeqHalf.metal    |  472 +++
 Sources/GrAIdient/Metal/MetalBuffer.swift     |  236 ++
 Sources/GrAIdient/Metal/MetalConfig.swift     |  815 ++--
 Sources/GrAIdient/Metal/MetalKernel.swift     |   24 +-
 Sources/GrAIdient/Utils/Buffer.swift          |  159 +-
 Sources/GrAIdient/Utils/Image.swift           |   24 +-
 Tests/GrAIExamples/AutoEncoderExample.swift   |    2 +
 Tests/GrAIExamples/AutoEncoderTests.swift     |    2 +
 Tests/GrAIExamples/Base/setup.py              |    2 +-
 Tests/GrAIExamples/TransformerBenchmark.swift |   38 +-
 Tests/GrAIExamples/TransformerExample.swift   |   14 +-
 Tests/GrAIExamples/VGGBenchmark.swift         |   42 +-
 Tests/GrAIExamples/VGGExample.swift           |   14 +-
 .../Base/Input1D/Input1DBCE1DCase.swift       |    2 +
 .../Input1D/Input1DBCESigmoid1DCase.swift     |    2 +
 .../Input1D/Input1DLinearError1DCase.swift    |    2 +
 .../Base/Input1D/Input1DMSE1DCase.swift       |    2 +
 .../Base/Input2D/Input2DBCE2DCase.swift       |    2 +
 .../Input2D/Input2DBCESigmoid2DCase.swift     |    2 +
 .../Base/Input2D/Input2DMSE1DCase.swift       |    2 +
 .../Base/Input2D/Input2DMSE2DCase.swift       |    2 +
 .../Input2DSimilarityBatchError2DCase.swift   |    2 +
 .../Input2DSimilarityError2DCase.swift        |    2 +
 .../Base/Input2D/Input2DVQ2DCase.swift        |    2 +
 .../Base/Input2D/Input2DVQSeqCase.swift       |    2 +
 Tests/GrAITests/ImageTests.swift              |    7 +-
 Tests/GrAITests/Layer2DTests.swift            |   12 +-
 Tests/GrAITests/OptimizerTests.swift          |    4 +
 Tests/GrAITests/ReduceTests.swift             |  155 +-
 Tests/GrAITests/UpdateManagementTests.swift   |   18 +-
 Tests/GrAITorchTests/Base/setup.py            |    2 +-
 Tests/GrAITorchTests/GrAITorchTests.swift     |    2 +
 126 files changed, 16078 insertions(+), 1557 deletions(-)
 rename Sources/GrAIdient/Metal/Kernel/{Activation.metal => ActivationFloat.metal} (94%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/ActivationHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{BatchNorm.metal => BatchNormFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/BatchNormHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Biases.metal => BiasesFloat.metal} (96%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/BiasesHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Convolution.metal => ConvolutionFloat.metal} (98%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/ConvolutionHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Deconvolution.metal => DeconvolutionFloat.metal} (98%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/DeconvolutionHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{FullyConnected.metal => FullyConnectedFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/FullyConnectedHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{FullyConnectedPatch.metal => FullyConnectedPatchFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/FullyConnectedPatchHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{FullyConnectedSeq.metal => FullyConnectedSeqFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/FullyConnectedSeqHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{InstanceNorm.metal => InstanceNormFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/InstanceNormHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Layer1D.metal => Layer1DFloat.metal} (96%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/Layer1DHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Layer2D.metal => Layer2DFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/Layer2DHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{LayerMerge.metal => LayerMergeFloat.metal} (93%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/LayerMergeHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{LayerNorm.metal => LayerNormFloat.metal} (96%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/LayerNormHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{LayerSeq.metal => LayerSeqFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/LayerSeqHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Optimizer.metal => OptimizerFloat.metal} (96%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/OptimizerHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Reduce.metal => ReduceFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/ReduceHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Reset.metal => ResetFloat.metal} (94%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/ResetHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{VQ2D.metal => VQ2DFloat.metal} (98%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/VQ2DHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{VQSeq.metal => VQSeqFloat.metal} (98%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/VQSeqHalf.metal

diff --git a/CHANGELOG.md b/CHANGELOG.md
index df809de1..0fe68551 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
## [unreleased] +🔨 **refactor:** handle float16 along float on GPU ([#120](https://github.com/owkin/GrAIdient/pull/120))\ 🚀 **perf:** copy & generate weights faster ([119](https://github.com/owkin/GrAIdient/pull/119))\ 🚀 **perf:** Convolution2D ([118](https://github.com/owkin/GrAIdient/pull/118))\ 🪜 **feat:** LayerCAM2D -> VQGrad2D, LayerCAMSeq -> VQGradSeq ([#117](https://github.com/owkin/GrAIdient/pull/117))\ diff --git a/Package.swift b/Package.swift index 8cc64efb..a386a0a9 100644 --- a/Package.swift +++ b/Package.swift @@ -7,7 +7,7 @@ import PackageDescription let package = Package( name: "GrAIdient", platforms: [ - .macOS(.v10_15) + .macOS(.v13) ], products: [ .library( diff --git a/Sources/GrAIdient/Core/Function/Activation.swift b/Sources/GrAIdient/Core/Function/Activation.swift index edb79edd..0e6bc93e 100644 --- a/Sources/GrAIdient/Core/Function/Activation.swift +++ b/Sources/GrAIdient/Core/Function/Activation.swift @@ -307,8 +307,8 @@ open class ActivationFunction: Codable /// - deviceID: GPU device where to execute the operation. /// private func _forwardGPU( - tmp: MetalBuffer, - outs: MetalBuffer, + tmp: FloatBuffer, + outs: FloatBuffer, deviceID: Int) { let nbElems = outs.nbElems @@ -335,8 +335,9 @@ open class ActivationFunction: Codable let nbElems = layer.outs.nbElems if layer._tmp == nil { - layer._tmp = MetalPrivateBuffer( - nbElems, deviceID: layer.deviceID) + layer._tmp = FloatBuffer( + nbElems: nbElems, deviceID: layer.deviceID + ) } _forwardGPU( tmp: layer._tmp, @@ -355,7 +356,7 @@ open class ActivationFunction: Codable let nbElems = layer.outs.nbElems if layer._tmp == nil { - layer._tmp = MetalPrivateBuffer( + layer._tmp = FloatBuffer(nbElems: nbElems, deviceID: layer.deviceID) } _forwardGPU( @@ -375,8 +376,9 @@ open class ActivationFunction: Codable let nbElems = layer.outs.nbElems if layer._tmp == nil { - layer._tmp = MetalPrivateBuffer( - nbElems, deviceID: layer.deviceID) + layer._tmp = FloatBuffer( + nbElems: nbElems, deviceID: layer.deviceID + ) } _forwardGPU( tmp: layer._tmp, @@ -394,8 +396,8 @@ open class ActivationFunction: Codable /// - deviceID: GPU device where to execute the operation. /// private func _backwardGPU( - tmp: MetalBuffer, - delta: MetalBuffer, + tmp: FloatBuffer, + delta: FloatBuffer, deviceID: Int) { let nbElems = delta.nbElems diff --git a/Sources/GrAIdient/Core/Layer/LayerInput.swift b/Sources/GrAIdient/Core/Layer/LayerInput.swift index c3cf7e81..d9ba95b5 100644 --- a/Sources/GrAIdient/Core/Layer/LayerInput.swift +++ b/Sources/GrAIdient/Core/Layer/LayerInput.swift @@ -105,14 +105,13 @@ class InputBuffers { /// The link to the layer. unowned let _layer: T - /// Number of elements in the different buffers. - let nbElems: Int - /// GPU device where the buffers are sent. - let deviceID: Int - var _m: MetalBuffer! = nil - var _v: MetalBuffer! = nil - var _vHat: MetalBuffer! = nil + /// Momentum buffer. + public let m: FloatBuffer + /// Velocity buffer. + public let v: FloatBuffer + /// Velocity normalized buffer. + public let vHat: FloatBuffer /// /// Create a container of buffers. @@ -127,51 +126,16 @@ class InputBuffers deviceID: Int) { _layer = layer - self.nbElems = nbElems - self.deviceID = deviceID - } - - /// Momentum buffer. - var m: MetalBuffer - { - get { - if _m == nil - { - _m = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _m - } - } - - /// Velocity buffer. 
- var v: MetalBuffer - { - get { - if _v == nil - { - _v = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _v - } - } - - /// Velocity normalized buffer. - var vHat: MetalBuffer - { - get { - if _vHat == nil - { - _vHat = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _vHat - } + m = FloatBuffer(nbElems: nbElems, deviceID: deviceID) + v = FloatBuffer(nbElems: nbElems, deviceID: deviceID) + vHat = FloatBuffer(nbElems: nbElems, deviceID: deviceID) } /// Clean the momentum..., preserving the weights. func reset() { - _m = nil - _v = nil - _vHat = nil + m.reset() + v.reset() + vHat.reset() } } diff --git a/Sources/GrAIdient/Core/Layer/LayerNormalization.swift b/Sources/GrAIdient/Core/Layer/LayerNormalization.swift index c572ff77..2ac13f33 100644 --- a/Sources/GrAIdient/Core/Layer/LayerNormalization.swift +++ b/Sources/GrAIdient/Core/Layer/LayerNormalization.swift @@ -620,7 +620,7 @@ public class BatchNormalization: LayerWeightsStatsNormalization } /// Get the weights in the CPU execution context. - func collectWeights() -> [IWeightArrays] + func collectWeights() -> [WeightArrays] { return [_Ɣ, _β] } @@ -633,50 +633,50 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization /// Buffer of weights to scale the normalization result. /// Shape ~ (nbNeurons,). /// - var _Ɣ: IWeightBuffers! = nil + var _Ɣ: WeightBuffers! = nil /// /// Buffer of biases to add to the normalization result. /// Shape ~ (nbNeurons,). /// - var _β: IWeightBuffers! = nil + var _β: WeightBuffers! = nil /// /// Buffer of averages of data for the different independent batch normalization units. /// Shape ~ (nbNeurons,). /// - var _μ: MetalBuffer! = nil + var _μ: FloatBuffer! = nil /// /// Buffer of global averages of data for the different independent batch normalization units. /// Shape ~ (nbNeurons,). /// - var _Eμ: MetalPrivateBuffer! = nil + var _Eμ: FloatBuffer! = nil /// /// Buffer of deviations of data for the different independent batch normalization units. /// Shape ~ (nbNeurons,). /// - var _σ2: MetalBuffer! = nil + var _σ2: FloatBuffer! = nil /// /// Buffer of global deviations of data for the different independent batch normalization units. /// Shape ~ (nbNeurons,). /// - var _Eσ2: MetalPrivateBuffer! = nil + var _Eσ2: FloatBuffer! = nil /// /// Buffer of data normalized without taking into account the biases and the weights. /// Shape ~ (batch, nbNeurons, height, width). /// - var _xHat: MetalBuffer! = nil + var _xHat: FloatBuffer! = nil /// /// Buffer used to compute backward pass. /// Shape ~ (nbNeurons,). /// - var _sum1: MetalBuffer! = nil + var _sum1: FloatBuffer! = nil /// /// Buffer used to compute backward pass. /// Shape ~ (nbNeurons,). /// - var _sum2: MetalBuffer! = nil + var _sum2: FloatBuffer! = nil /// GPU device on which model is executed. 
var _deviceID = 0 @@ -690,11 +690,8 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization return super.weights } - MetalKernel.get.download([_β.w_p!, _Ɣ.w_p!]) - - var weightsTmp = [Float]() - weightsTmp += _Ɣ.w_p!.shared.array - weightsTmp += _β.w_p!.shared.array + var weightsTmp = _Ɣ!.w.download() + weightsTmp += _β!.w.download() return weightsTmp } set { @@ -717,11 +714,8 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization return super.stats } - MetalKernel.get.download([_Eμ, _Eσ2]) - - var statsTmp = [Float]() - statsTmp += _Eμ.shared.array - statsTmp += _Eσ2.shared.array + var statsTmp = _Eμ.download() + statsTmp += _Eσ2.download() return statsTmp } set { @@ -781,58 +775,38 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization _β = WeightBuffers(nbElems: _nbNeurons, deviceID: _deviceID) _Ɣ = WeightBuffers(nbElems: _nbNeurons, deviceID: _deviceID) - let βPtr = _β.w_p!.shared.buffer - let ƔPtr = _Ɣ.w_p!.shared.buffer - if _weightsList.count == 0 { + _weightsList = [Float](repeating: 0.0, count: 2 * _nbNeurons) for depth in 0..<_nbNeurons { - ƔPtr[depth] = 1.0 - βPtr[depth] = 0.0 - } - } - else - { - for depth in 0..<_nbNeurons - { - ƔPtr[depth] = _weightsList[depth] - βPtr[depth] = _weightsList[_nbNeurons + depth] + _weightsList[depth] = 1.0 } - _weightsList = [] } - MetalKernel.get.upload([_β.w_p!, _Ɣ.w_p!]) + _Ɣ.w.initialize(array: &_weightsList) + _β.w.initialize(array: &_weightsList, start: _nbNeurons) + + _weightsList = [] } /// Initialize stats in the GPU execution context. func initStats() { - _Eμ = MetalPrivateBuffer(_nbNeurons, deviceID: _deviceID) - _Eσ2 = MetalPrivateBuffer(_nbNeurons, deviceID: _deviceID) - - let EμPtr = _Eμ.shared.buffer - let Eσ2Ptr = _Eσ2.shared.buffer + _Eμ = FloatBuffer(nbElems: _nbNeurons, deviceID: _deviceID) + _Eσ2 = FloatBuffer(nbElems: _nbNeurons, deviceID: _deviceID) - if _statsList.count == 0 + if _statsList.count != 0 { - for depth in 0..<_nbNeurons - { - EμPtr[depth] = 0.0 - Eσ2Ptr[depth] = 0.0 - } + _Eμ.initialize(array: &_statsList) + _Eσ2.initialize(array: &_statsList, start: _nbNeurons) } else { - for depth in 0..<_nbNeurons - { - EμPtr[depth] = _statsList[depth] - Eσ2Ptr[depth] = _statsList[_nbNeurons + depth] - } - _statsList = [] + _Eμ.initialize() + _Eσ2.initialize() } - - MetalKernel.get.upload([_Eμ, _Eσ2]) + _statsList = [] } /// @@ -880,7 +854,7 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization if _μ == nil { - _μ = MetalPrivateBuffer(_nbNeurons, deviceID: _deviceID) + _μ = FloatBuffer(nbElems: _nbNeurons, deviceID: _deviceID) } let command = MetalKernel.get.createCommand( @@ -913,7 +887,7 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization if _σ2 == nil { - _σ2 = MetalPrivateBuffer(_nbNeurons, deviceID: _deviceID) + _σ2 = FloatBuffer(nbElems: _nbNeurons, deviceID: _deviceID) } let command = MetalKernel.get.createCommand( @@ -948,7 +922,7 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization if _xHat == nil { - _xHat = MetalPrivateBuffer( + _xHat = FloatBuffer(nbElems: batchSize * _nbNeurons * width * height, deviceID: _deviceID ) @@ -1039,8 +1013,8 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization if _sum1 == nil { - _sum1 = MetalPrivateBuffer(_nbNeurons, deviceID: _deviceID) - _sum2 = MetalPrivateBuffer(_nbNeurons, deviceID: _deviceID) + _sum1 = FloatBuffer(nbElems: _nbNeurons, deviceID: _deviceID) + _sum2 = FloatBuffer(nbElems: _nbNeurons, deviceID: _deviceID) } let command = MetalKernel.get.createCommand( @@ -1126,7 +1100,7 @@ class BatchNormalizationGPU: 
LayerWeightsStatsNormalization } /// Get the weights in the GPU execution context. - func collectWeights() -> [IWeightBuffers] + func collectWeights() -> [WeightBuffers] { return [_Ɣ, _β] } @@ -1475,7 +1449,7 @@ public class InstanceNormalization: LayerWeightsNormalization } /// Get the weights in the CPU execution context. - func collectWeights() -> [IWeightArrays] + func collectWeights() -> [WeightArrays] { return [_Ɣ, _β] } @@ -1488,40 +1462,40 @@ class InstanceNormalizationGPU: LayerWeightsNormalization /// Buffer of weights to scale the normalization result. /// Shape ~ (nbNeurons,). /// - var _Ɣ: IWeightBuffers! = nil + var _Ɣ: WeightBuffers! = nil /// /// Buffer of biases to add to the normalization result. /// Shape ~ (nbNeurons,). /// - var _β: IWeightBuffers! = nil + var _β: WeightBuffers! = nil /// /// Buffer of averages of data for the different independent batch normalization units. /// Shape ~ (batch, nbNeurons). /// - var _μ: MetalBuffer! = nil + var _μ: FloatBuffer! = nil /// /// Buffer of deviations of data for the different independent batch normalization units. /// Shape ~ (batch, nbNeurons). /// - var _σ2: MetalBuffer! = nil + var _σ2: FloatBuffer! = nil /// /// Buffer of data normalized without taking into account the biases and the weights. /// Shape ~ (batch, nbNeurons, height, width). /// - var _xHat: MetalBuffer! = nil + var _xHat: FloatBuffer! = nil /// /// Buffer used to compute backward pass. /// Shape ~ (nbNeurons,). /// - var _sum1: MetalBuffer! = nil + var _sum1: FloatBuffer! = nil /// /// Buffer used to compute backward pass. /// Shape ~ (nbNeurons,). /// - var _sum2: MetalBuffer! = nil + var _sum2: FloatBuffer! = nil /// GPU device on which model is executed. var _deviceID = 0 @@ -1535,11 +1509,8 @@ class InstanceNormalizationGPU: LayerWeightsNormalization return super.weights } - MetalKernel.get.download([_β.w_p!, _Ɣ.w_p!]) - - var weightsTmp = [Float]() - weightsTmp += _Ɣ.w_p!.shared.array - weightsTmp += _β.w_p!.shared.array + var weightsTmp = _Ɣ!.w.download() + weightsTmp += _β!.w.download() return weightsTmp } set { @@ -1597,28 +1568,19 @@ class InstanceNormalizationGPU: LayerWeightsNormalization _β = WeightBuffers(nbElems: _nbNeurons, deviceID: _deviceID) _Ɣ = WeightBuffers(nbElems: _nbNeurons, deviceID: _deviceID) - let βPtr = _β.w_p!.shared.buffer - let ƔPtr = _Ɣ.w_p!.shared.buffer - if _weightsList.count == 0 { + _weightsList = [Float](repeating: 0.0, count: 2 * _nbNeurons) for depth in 0..<_nbNeurons { - ƔPtr[depth] = 1.0 - βPtr[depth] = 0.0 - } - } - else - { - for depth in 0..<_nbNeurons - { - ƔPtr[depth] = _weightsList[depth] - βPtr[depth] = _weightsList[_nbNeurons + depth] + _weightsList[depth] = 1.0 } - _weightsList = [] } - MetalKernel.get.upload([_β.w_p!, _Ɣ.w_p!]) + _Ɣ.w.initialize(array: &_weightsList) + _β.w.initialize(array: &_weightsList, start: _nbNeurons) + + _weightsList = [] } /// @@ -1654,7 +1616,7 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _xHat == nil { - _xHat = MetalPrivateBuffer( + _xHat = FloatBuffer(nbElems: batchSize * _nbNeurons * width * height, deviceID: _deviceID ) @@ -1698,7 +1660,7 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _xHat == nil { - _xHat = MetalPrivateBuffer( + _xHat = FloatBuffer(nbElems: batchSize * _nbNeurons * width * height, deviceID: _deviceID ) @@ -1738,7 +1700,7 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _μ == nil { - _μ = MetalPrivateBuffer( + _μ = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) } @@ -1771,7 +1733,7 @@ 
class InstanceNormalizationGPU: LayerWeightsNormalization if _μ == nil { - _μ = MetalPrivateBuffer( + _μ = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) } @@ -1803,7 +1765,7 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _σ2 == nil { - _σ2 = MetalPrivateBuffer( + _σ2 = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) } @@ -1837,7 +1799,7 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _σ2 == nil { - _σ2 = MetalPrivateBuffer( + _σ2 = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) } @@ -1941,10 +1903,10 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _sum1 == nil { - _sum1 = MetalPrivateBuffer( + _sum1 = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) - _sum2 = MetalPrivateBuffer( + _sum2 = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) } @@ -1983,10 +1945,10 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _sum1 == nil { - _sum1 = MetalPrivateBuffer( + _sum1 = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) - _sum2 = MetalPrivateBuffer( + _sum2 = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) } @@ -2359,40 +2321,40 @@ class LayerNormalizationGPU: LayerWeightsNormalization /// Buffer of weights to scale the normalization result. /// Shape ~ (nbNeurons,). /// - var _Ɣ: IWeightBuffers! = nil + var _Ɣ: WeightBuffers! = nil /// /// Buffer of biases to add to the normalization result. /// Shape ~ (nbNeurons,). /// - var _β: IWeightBuffers! = nil + var _β: WeightBuffers! = nil /// /// Buffer of averages of data for the different independent batch normalization units. /// Shape ~ (batch, sequence). /// - var _μ: MetalBuffer! = nil + var _μ: FloatBuffer! = nil /// /// Buffer of deviations of data for the different independent batch normalization units. /// Shape ~ (batch, sequence). /// - var _σ2: MetalBuffer! = nil + var _σ2: FloatBuffer! = nil /// /// Buffer of data normalized without taking into account the biases and the weights. /// Shape ~ (batch, sequence, nbNeurons). /// - var _xHat: MetalBuffer! = nil + var _xHat: FloatBuffer! = nil /// /// Buffer used to compute backward pass. /// Shape ~ (batch, sequence). /// - var _sum1: MetalBuffer! = nil + var _sum1: FloatBuffer! = nil /// /// Buffer used to compute backward pass. /// Shape ~ (batch, sequence). /// - var _sum2: MetalBuffer! = nil + var _sum2: FloatBuffer! = nil /// GPU device on which model is executed. 
var _deviceID = 0 @@ -2406,11 +2368,8 @@ class LayerNormalizationGPU: LayerWeightsNormalization return super.weights } - MetalKernel.get.download([_β.w_p!, _Ɣ.w_p!]) - - var weightsTmp = [Float]() - weightsTmp += _Ɣ.w_p!.shared.array - weightsTmp += _β.w_p!.shared.array + var weightsTmp = _Ɣ!.w.download() + weightsTmp += _β!.w.download() return weightsTmp } set { @@ -2468,28 +2427,19 @@ class LayerNormalizationGPU: LayerWeightsNormalization _β = WeightBuffers(nbElems: _nbNeurons, deviceID: _deviceID) _Ɣ = WeightBuffers(nbElems: _nbNeurons, deviceID: _deviceID) - let βPtr = _β.w_p!.shared.buffer - let ƔPtr = _Ɣ.w_p!.shared.buffer - if _weightsList.count == 0 { + _weightsList = [Float](repeating: 0.0, count: 2 * _nbNeurons) for depth in 0..<_nbNeurons { - ƔPtr[depth] = 1.0 - βPtr[depth] = 0.0 + _weightsList[depth] = 1.0 } } - else - { - for depth in 0..<_nbNeurons - { - ƔPtr[depth] = _weightsList[depth] - βPtr[depth] = _weightsList[_nbNeurons + depth] - } - _weightsList = [] - } - MetalKernel.get.upload([_β.w_p!, _Ɣ.w_p!]) + _Ɣ.w.initialize(array: &_weightsList) + _β.w.initialize(array: &_weightsList, start: _nbNeurons) + + _weightsList = [] } /// @@ -2524,7 +2474,7 @@ class LayerNormalizationGPU: LayerWeightsNormalization if _xHat == nil { - _xHat = MetalPrivateBuffer( + _xHat = FloatBuffer(nbElems: batchSize * sequence * _nbNeurons, deviceID: _deviceID ) @@ -2565,7 +2515,7 @@ class LayerNormalizationGPU: LayerWeightsNormalization if _μ == nil { - _μ = MetalPrivateBuffer( + _μ = FloatBuffer(nbElems: batchSize * sequence, deviceID: _deviceID ) } @@ -2597,7 +2547,7 @@ class LayerNormalizationGPU: LayerWeightsNormalization if _σ2 == nil { - _σ2 = MetalPrivateBuffer( + _σ2 = FloatBuffer(nbElems: batchSize * sequence, deviceID: _deviceID ) } @@ -2666,10 +2616,10 @@ class LayerNormalizationGPU: LayerWeightsNormalization if _sum1 == nil { - _sum1 = MetalPrivateBuffer( + _sum1 = FloatBuffer(nbElems: batchSize * sequence, deviceID: _deviceID ) - _sum2 = MetalPrivateBuffer( + _sum2 = FloatBuffer(nbElems: batchSize * sequence, deviceID: _deviceID ) } diff --git a/Sources/GrAIdient/Core/Layer/LayerUpdate.swift b/Sources/GrAIdient/Core/Layer/LayerUpdate.swift index 92adb1fa..0a94648c 100644 --- a/Sources/GrAIdient/Core/Layer/LayerUpdate.swift +++ b/Sources/GrAIdient/Core/Layer/LayerUpdate.swift @@ -74,15 +74,15 @@ public protocol IWeightBuffers var nbElems: Int { get } /// Weights buffer: the buffer to be update. - var w: MetalBuffer { get } + var w: FloatBuffer { get } /// Gradients buffer. - var g: MetalBuffer { get } + var g: FloatBuffer { get } /// Momentum buffer. - var m: MetalBuffer { get } + var m: FloatBuffer { get } /// Velocity buffer. - var v: MetalBuffer { get } + var v: FloatBuffer { get } /// Velocity normalized buffer. - var vHat: MetalBuffer { get } + var vHat: FloatBuffer { get } /// Clean the momentum..., preserving the weights. func reset() @@ -90,50 +90,35 @@ public protocol IWeightBuffers extension IWeightBuffers { - /// Get the weights as a private buffer. - var w_p: MetalPrivateBuffer? - { - get { - return w as? MetalPrivateBuffer - } - } - /// Get the weights as a shared buffer. - var w_s: MetalSharedBuffer? - { - get { - return w as? MetalSharedBuffer - } - } - - /// Get the gradient buffer as a private buffer. - var g_p: MetalPrivateBuffer? + /// GPU device where the buffers are sent. + public var deviceID: Int { get { - return g as? MetalPrivateBuffer + return w.deviceID } } - /// Get the gradient buffer as a shared buffer. - var g_s: MetalSharedBuffer? 
+ /// Number of elements in the different buffers. + public var nbElems: Int { get { - return g as? MetalSharedBuffer + return w.nbElems } } } /// GPU buffers needed to update the weights. -class WeightBuffers: IWeightBuffers +public class WeightBuffers: IWeightBuffers { - /// Number of elements in the different buffers. - let nbElems: Int - /// GPU device where the buffers are sent. - let deviceID: Int - - var _w: MetalBuffer! = nil - var _g: MetalBuffer! = nil - var _m: MetalBuffer! = nil - var _v: MetalBuffer! = nil - var _vHat: MetalBuffer! = nil + /// Weights buffer: the buffer to be update. + public let w: FloatBuffer + /// Gradients buffer. + public let g: FloatBuffer + /// Momentum buffer. + public let m: FloatBuffer + /// Velocity buffer. + public let v: FloatBuffer + /// Velocity normalized buffer. + public let vHat: FloatBuffer /// /// Create a container of buffers. @@ -144,78 +129,21 @@ class WeightBuffers: IWeightBuffers /// init(nbElems: Int, deviceID: Int) { - self.nbElems = nbElems - self.deviceID = deviceID - } - - /// Weights buffer: the buffer to be update. - var w: MetalBuffer - { - get { - if _w == nil - { - _w = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _w - } - } - - /// Gradients buffer. - var g: MetalBuffer - { - get { - if _g == nil - { - _g = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _g - } - } - - /// Momentum buffer. - var m: MetalBuffer - { - get { - if _m == nil - { - _m = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _m - } - } - - /// Velocity buffer. - var v: MetalBuffer - { - get { - if _v == nil - { - _v = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _v - } + w = FloatBuffer(nbElems: nbElems, deviceID: deviceID) + g = FloatBuffer(nbElems: nbElems, deviceID: deviceID) + m = FloatBuffer(nbElems: nbElems, deviceID: deviceID) + v = FloatBuffer(nbElems: nbElems, deviceID: deviceID) + vHat = FloatBuffer(nbElems: nbElems, deviceID: deviceID) } - /// Velocity normalized buffer. - var vHat: MetalBuffer + /// Clean the buffers. + public func reset() { - get { - if _vHat == nil - { - _vHat = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _vHat - } - } - - /// Clean the momentum..., preserving the weights. - func reset() - { - // do not touch _w - _g = nil - _m = nil - _v = nil - _vHat = nil + // do not touch w + g.reset() + m.reset() + v.reset() + vHat.reset() } } @@ -257,7 +185,11 @@ extension LayerWeightInit } } + /// /// Generate list of weights values. + /// + /// - Returns: The generated list of values. + /// public func generateWeightsList() -> [Float] { let nbElems = weightListSize @@ -289,8 +221,16 @@ extension LayerWeightInit return weightsList } + /// + /// Generate weights values. + /// + /// - Parameters: + /// - out: The output buffer. + /// - deviceID: GPU device. 
+ /// public func generateWeightsList( - buffer: UnsafeMutableBufferPointer) + out: FloatBuffer, + deviceID: Int) { let nbElems = weightListSize switch weightInitClass { @@ -298,27 +238,31 @@ extension LayerWeightInit Self.XavierUniform( nbElems: nbElems, connectivityIO: connectivityIO, - buffer: buffer + out: out, + deviceID: deviceID ) case .XavierNormal: Self.XavierNormal( nbElems: nbElems, connectivityIO: connectivityIO, - buffer: buffer + out: out, + deviceID: deviceID ) case .KaimingUniform: Self.KaimingUniform( nbElems: nbElems, coeff: coeffInitWeights, connectivityIO: connectivityIO, - buffer: buffer + out: out, + deviceID: deviceID ) case .KaimingNormal: Self.KaimingNormal( nbElems: nbElems, coeff: coeffInitWeights, connectivityIO: connectivityIO, - buffer: buffer + out: out, + deviceID: deviceID ) } } @@ -350,23 +294,28 @@ extension LayerWeightInit /// - Parameters: /// - nbElems: Number of weights to initialize. /// - connectivityIO: Number of input and output connections. - /// - buffer: The buffer of values. + /// - out: The output buffer. + /// - deviceID: GPU device. /// static func XavierUniform( nbElems: Int, connectivityIO: (Int, Int), - buffer: UnsafeMutableBufferPointer) + out: FloatBuffer, + deviceID: Int) { - let bound = sqrt(6) / sqrt(Float(connectivityIO.0 + connectivityIO.1)) - if #available(macOS 13.0, *) + var array = [Float](repeating: 0.0, count: nbElems) + array.withUnsafeMutableBufferPointer { - guard - var arrayDescriptor = BNNSNDArrayDescriptor( - data: buffer, - shape: .vector(nbElems)), - let randomNumberGenerator = BNNSCreateRandomGenerator( - BNNSRandomGeneratorMethodAES_CTR, - nil) else + ptr in + + let bound = + sqrt(6) / sqrt(Float(connectivityIO.0 + connectivityIO.1)) + guard var arrayDescriptor = BNNSNDArrayDescriptor( + data: ptr, + shape: .vector(nbElems)), + let randomNumberGenerator = BNNSCreateRandomGenerator( + BNNSRandomGeneratorMethodAES_CTR, + nil) else { fatalError() } @@ -379,11 +328,8 @@ extension LayerWeightInit ) BNNSDestroyRandomGenerator(randomNumberGenerator) - } - else - { - fatalError() } + out.initialize(array: &array) } /// @@ -413,23 +359,27 @@ extension LayerWeightInit /// - Parameters: /// - nbElems: Number of weights to initialize. /// - connectivityIO: Number of input and output connections. - /// - buffer: The buffer of values. + /// - out: The output buffer. + /// - deviceID: GPU device. /// static func XavierNormal( nbElems: Int, connectivityIO: (Int, Int), - buffer: UnsafeMutableBufferPointer) + out: FloatBuffer, + deviceID: Int) { - let std = sqrt(2) / sqrt(Float(connectivityIO.0 + connectivityIO.1)) - if #available(macOS 13.0, *) + var array = [Float](repeating: 0.0, count: nbElems) + array.withUnsafeMutableBufferPointer { - guard - var arrayDescriptor = BNNSNDArrayDescriptor( - data: buffer, - shape: .vector(nbElems)), - let randomNumberGenerator = BNNSCreateRandomGenerator( - BNNSRandomGeneratorMethodAES_CTR, - nil) else + ptr in + + let std = sqrt(2) / sqrt(Float(connectivityIO.0 + connectivityIO.1)) + guard var arrayDescriptor = BNNSNDArrayDescriptor( + data: ptr, + shape: .vector(nbElems)), + let randomNumberGenerator = BNNSCreateRandomGenerator( + BNNSRandomGeneratorMethodAES_CTR, + nil) else { fatalError() } @@ -443,10 +393,7 @@ extension LayerWeightInit BNNSDestroyRandomGenerator(randomNumberGenerator) } - else - { - fatalError() - } + out.initialize(array: &array) } /// @@ -479,24 +426,28 @@ extension LayerWeightInit /// - nbElems: Number of weights to initialize. /// - coeff: Multiplicative coefficient. 
/// - connectivityIO: Number of input and output connections. - /// - buffer: The buffer of values. + /// - out: The output buffer. + /// - deviceID: GPU device. /// static func KaimingUniform( nbElems: Int, coeff: Float, connectivityIO: (Int, Int), - buffer: UnsafeMutableBufferPointer) + out: FloatBuffer, + deviceID: Int) { - let bound = sqrt(3) * coeff / sqrt(Float(connectivityIO.0)) - if #available(macOS 13.0, *) + var array = [Float](repeating: 0.0, count: nbElems) + array.withUnsafeMutableBufferPointer { - guard - var arrayDescriptor = BNNSNDArrayDescriptor( - data: buffer, - shape: .vector(nbElems)), - let randomNumberGenerator = BNNSCreateRandomGenerator( - BNNSRandomGeneratorMethodAES_CTR, - nil) else + ptr in + + let bound = sqrt(3) * coeff / sqrt(Float(connectivityIO.0)) + guard var arrayDescriptor = BNNSNDArrayDescriptor( + data: ptr, + shape: .vector(nbElems)), + let randomNumberGenerator = BNNSCreateRandomGenerator( + BNNSRandomGeneratorMethodAES_CTR, + nil) else { fatalError() } @@ -510,10 +461,7 @@ extension LayerWeightInit BNNSDestroyRandomGenerator(randomNumberGenerator) } - else - { - fatalError() - } + out.initialize(array: &array) } /// @@ -546,24 +494,28 @@ extension LayerWeightInit /// - nbElems: Number of weights to initialize. /// - coeff: Multiplicative coefficient. /// - connectivityIO: Number of input and output connections. - /// - buffer: The buffer of values. + /// - out: The output buffer. + /// - deviceID: GPU device. /// static func KaimingNormal( nbElems: Int, coeff: Float, connectivityIO: (Int, Int), - buffer: UnsafeMutableBufferPointer) + out: FloatBuffer, + deviceID: Int) { - let std = coeff / sqrt(Float(connectivityIO.0)) - if #available(macOS 13.0, *) + var array = [Float](repeating: 0.0, count: nbElems) + array.withUnsafeMutableBufferPointer { - guard - var arrayDescriptor = BNNSNDArrayDescriptor( - data: buffer, - shape: .vector(nbElems)), - let randomNumberGenerator = BNNSCreateRandomGenerator( - BNNSRandomGeneratorMethodAES_CTR, - nil) else + ptr in + + let std = coeff / sqrt(Float(connectivityIO.0)) + guard var arrayDescriptor = BNNSNDArrayDescriptor( + data: ptr, + shape: .vector(nbElems)), + let randomNumberGenerator = BNNSCreateRandomGenerator( + BNNSRandomGeneratorMethodAES_CTR, + nil) else { fatalError() } @@ -577,10 +529,7 @@ extension LayerWeightInit BNNSDestroyRandomGenerator(randomNumberGenerator) } - else - { - fatalError() - } + out.initialize(array: &array) } } diff --git a/Sources/GrAIdient/Core/Model/Model.swift b/Sources/GrAIdient/Core/Model/Model.swift index 5828020a..583c0a8b 100644 --- a/Sources/GrAIdient/Core/Model/Model.swift +++ b/Sources/GrAIdient/Core/Model/Model.swift @@ -948,7 +948,7 @@ public class Model: BaseModel if GrAI.Opti.GPU { let gNorm: Float? = gradientNorm != nil ? - Float(gradientNorm!) : nil + Float(gradientNorm!) 
: nil try _kernel.algo.udpateGPU(layers: myLayers, gradientNorm: gNorm) } diff --git a/Sources/GrAIdient/Core/Optimizer/OptimizerAlgorithm.swift b/Sources/GrAIdient/Core/Optimizer/OptimizerAlgorithm.swift index 31f11259..e85cf693 100644 --- a/Sources/GrAIdient/Core/Optimizer/OptimizerAlgorithm.swift +++ b/Sources/GrAIdient/Core/Optimizer/OptimizerAlgorithm.swift @@ -170,7 +170,7 @@ public class OptimizerAlgorithm try clipGradientGPU( layers: layers, gradientNorm: gNorm, - normThreshold: _optimizer.params.normThreshold + normThreshold: Float(_optimizer.params.normThreshold) ) } @@ -233,7 +233,7 @@ public class OptimizerAlgorithm let nbElems = buffers.g.nbElems let pNbElems: [UInt32] = [UInt32(nbElems)] - let pFactor: [Float] = [Float(factor)] + let pFactor: [Float] = [factor] let command = MetalKernel.get.createCommand( "multiplyGradients", deviceID: layer.deviceID @@ -303,22 +303,7 @@ public class OptimizerAlgorithm for buffers in layerUpdate.collectWeightsGPU() { - let buffer: UnsafeMutableBufferPointer - if let g_p = buffers.g_p - { - MetalKernel.get.download([g_p]) - buffer = g_p.shared.buffer - } - else if let g_s = buffers.g_s - { - MetalKernel.get.download([g_s]) - buffer = g_s.buffer - } - else - { - fatalError("Unreachable.") - } - + let buffer = buffers.g.download() for i in 0.. - if let g_p = buffers.g_p - { - MetalKernel.get.download([g_p]) - buffer = g_p.shared.buffer - } - else if let g_s = buffers.g_s - { - MetalKernel.get.download([g_s]) - buffer = g_s.buffer - } - else - { - fatalError("Unreachable.") - } - + let buffer = buffers.g.download() for i in 0.. Float(normThreshold) { + if gradientNorm > normThreshold { for layer in layers { if let layerUpdate = layer as? LayerUpdate, @@ -486,8 +456,8 @@ public class OptimizerAlgorithm let nbElems = buffers.g.nbElems let pNbElems: [UInt32] = [UInt32(nbElems)] - let pGradientNorm: [Float] = [Float(gradientNorm)] - let pNormThreshold: [Float] = [Float(normThreshold)] + let pGradientNorm: [Float] = [gradientNorm] + let pNormThreshold: [Float] = [normThreshold] let command = MetalKernel.get.createCommand( "clipGradients", deviceID: layer.deviceID diff --git a/Sources/GrAIdient/Core/Optimizer/OptimizerImpl.swift b/Sources/GrAIdient/Core/Optimizer/OptimizerImpl.swift index 1a9899d9..5e237d3c 100644 --- a/Sources/GrAIdient/Core/Optimizer/OptimizerImpl.swift +++ b/Sources/GrAIdient/Core/Optimizer/OptimizerImpl.swift @@ -294,12 +294,12 @@ class AdamOptimizer: OptimizerImpl override func stepGPU(_ weights: IWeightBuffers) { let nbElems = weights.nbElems - let t = Double(_kernel.params.t) + let t = Float(_kernel.params.t) let pNbElems: [UInt32] = [UInt32(nbElems)] let pAlpha: [Float] = [Float(alpha)] let pLambda: [Float] = [lambda != nil ? Float(lambda!) : 0.0] - let pT: [Float] = [Float(t)] + let pT: [Float] = [t] let command = MetalKernel.get.createCommand( "weightsAdam", deviceID: weights.deviceID @@ -366,12 +366,12 @@ class AMSGradOptimizer: OptimizerImpl override func stepGPU(_ weights: IWeightBuffers) { let nbElems = weights.nbElems - let t = Double(_kernel.params.t) + let t = Float(_kernel.params.t) let pNbElems: [UInt32] = [UInt32(nbElems)] let pAlpha: [Float] = [Float(alpha)] let pLambda: [Float] = [lambda != nil ? Float(lambda!) 
: 0.0] - let pT: [Float] = [Float(t)] + let pT: [Float] = [t] let command = MetalKernel.get.createCommand( "weightsAMSGrad", deviceID: weights.deviceID @@ -449,12 +449,12 @@ class AdamRectifiedOptimizer: OptimizerImpl override func stepGPU(_ weights: IWeightBuffers) { let nbElems = weights.nbElems - let t = Double(_kernel.params.t) + let t = Float(_kernel.params.t) let pNbElems: [UInt32] = [UInt32(nbElems)] let pAlpha: [Float] = [Float(alpha)] let pLambda: [Float] = [lambda != nil ? Float(lambda!) : 0.0] - let pT: [Float] = [Float(t)] + let pT: [Float] = [t] let command = MetalKernel.get.createCommand( "weightsAdamRectified", deviceID: weights.deviceID @@ -583,12 +583,12 @@ class AdaBoundOptimizer: BoundOptimizer override func stepGPU(_ weights: IWeightBuffers) { let nbElems = weights.nbElems - let t = Double(_kernel.params.t) + let t = Float(_kernel.params.t) let pNbElems: [UInt32] = [UInt32(nbElems)] let pAlpha: [Float] = [Float(alpha)] let pLambda: [Float] = [lambda != nil ? Float(lambda!) : 0.0] - let pT: [Float] = [Float(t)] + let pT: [Float] = [t] let pLowerBound: [Float] = [Float(lowerBound!)] let pUpperBound: [Float] = [Float(upperBound!)] @@ -667,12 +667,12 @@ class AMSBoundOptimizer: BoundOptimizer override func stepGPU(_ weights: IWeightBuffers) { let nbElems = weights.nbElems - let t = Double(_kernel.params.t) + let t = Float(_kernel.params.t) let pNbElems: [UInt32] = [UInt32(nbElems)] let pAlpha: [Float] = [Float(alpha)] let pLambda: [Float] = [lambda != nil ? Float(lambda!) : 0.0] - let pT: [Float] = [Float(t)] + let pT: [Float] = [t] let pLowerBound: [Float] = [Float(lowerBound!)] let pUpperBound: [Float] = [Float(upperBound!)] diff --git a/Sources/GrAIdient/Core/State/Weights.swift b/Sources/GrAIdient/Core/State/Weights.swift index 03e2b610..a45053dc 100644 --- a/Sources/GrAIdient/Core/State/Weights.swift +++ b/Sources/GrAIdient/Core/State/Weights.swift @@ -27,10 +27,10 @@ public protocol IWeightArrays } /// Arrays needed to update the weights. -class WeightArrays: IWeightArrays +public class WeightArrays: IWeightArrays { /// Number of elements in the different arrays. - let nbElems: Int + public let nbElems: Int var _w: [Double] = [] var _g: [Double] = [] @@ -49,7 +49,7 @@ class WeightArrays: IWeightArrays } /// Weights array: the array to update. - var w: [Double] + public var w: [Double] { get { if _w.count == 0 @@ -69,7 +69,7 @@ class WeightArrays: IWeightArrays } } /// Gradients array. - var g: [Double] + public var g: [Double] { get { if _g.count == 0 @@ -89,7 +89,7 @@ class WeightArrays: IWeightArrays } } /// Momentum array. - var m: [Double] + public var m: [Double] { get { if _m.count == 0 @@ -109,7 +109,7 @@ class WeightArrays: IWeightArrays } } /// Velocity array. - var v: [Double] + public var v: [Double] { get { if _v.count == 0 @@ -129,7 +129,7 @@ class WeightArrays: IWeightArrays } } /// Veclocity normalized array. - var vHat: [Double] + public var vHat: [Double] { get { if _vHat.count == 0 @@ -150,7 +150,7 @@ class WeightArrays: IWeightArrays } /// Clean the momentum..., preserving the weights. - func reset() + public func reset() { _g = [] _m = [] diff --git a/Sources/GrAIdient/GrAI.swift b/Sources/GrAIdient/GrAI.swift index ae370274..7ead7164 100644 --- a/Sources/GrAIdient/GrAI.swift +++ b/Sources/GrAIdient/GrAI.swift @@ -70,6 +70,68 @@ public class GrAI } } + /// Namespace for precision settings. + public class Precision + { + /// Get/Set precision. 
+        public static var double: Bool
+        {
+            get {
+                return getCtx.precision == PrecisionMode.Double
+            }
+            set {
+                if newValue && GrAI.Opti.CPU
+                {
+                    getCtx.precision = PrecisionMode.Double
+                }
+                else if newValue
+                {
+                    fatalError(
+                        "Cannot set double precision with GPU optimization."
+                    )
+                }
+            }
+        }
+        /// Get/Set precision.
+        public static var float: Bool
+        {
+            get {
+                return getCtx.precision == PrecisionMode.Float
+            }
+            set {
+                if newValue && GrAI.Opti.GPU
+                {
+                    getCtx.precision = PrecisionMode.Float
+                }
+                else if newValue
+                {
+                    fatalError(
+                        "Cannot set float precision with CPU optimization."
+                    )
+                }
+            }
+        }
+        /// Get/Set precision.
+        public static var float16: Bool
+        {
+            get {
+                return getCtx.precision == PrecisionMode.Float16
+            }
+            set {
+                if newValue && GrAI.Opti.GPU
+                {
+                    getCtx.precision = PrecisionMode.Float16
+                }
+                else if newValue
+                {
+                    fatalError(
+                        "Cannot set float16 precision with CPU optimization."
+                    )
+                }
+            }
+        }
+    }
+
     /// Namespace for gradient settings.
     public class Gradient
     {
@@ -346,6 +408,14 @@ public class GrAI
     }
 }
 
+/// Precision mode.
+public enum PrecisionMode
+{
+    case Double
+    case Float
+    case Float16
+}
+
 /// A global context with stored variables.
 fileprivate class GrAIContext
 {
@@ -370,6 +440,12 @@ fileprivate class GrAIContext
         case GPU
     }
 
+    //--------------------------------------------------------------------------
+    // PRECISION
+    //--------------------------------------------------------------------------
+    /// Precision variable.
+    var precision = PrecisionMode.Float
+
     /// Used to select GPU device.
     var gpuNamedPriority = [String]()
 
diff --git a/Sources/GrAIdient/Layer1D/Activation1D.swift b/Sources/GrAIdient/Layer1D/Activation1D.swift
index 1afffaae..79fccd50 100644
--- a/Sources/GrAIdient/Layer1D/Activation1D.swift
+++ b/Sources/GrAIdient/Layer1D/Activation1D.swift
@@ -16,7 +16,7 @@ public class Activation1D: Layer1D
     /// used in the GPU execution context.
     /// Shape ~ (batch, nbNeurons).
     ///
-    var _tmp: MetalPrivateBuffer<Float>! = nil
+    var _tmp: FloatBuffer! = nil
 
     /// Get coefficient (depending on activation function) to apply during the weights initialization.
     public var coeffInitWeights: Float
diff --git a/Sources/GrAIdient/Layer1D/BCE1D.swift b/Sources/GrAIdient/Layer1D/BCE1D.swift
index da842382..8e3bdedc 100644
--- a/Sources/GrAIdient/Layer1D/BCE1D.swift
+++ b/Sources/GrAIdient/Layer1D/BCE1D.swift
@@ -207,7 +207,7 @@ public class BCE1D: LayerOutput1D
     /// - Returns: The loss value.
     ///
     public func getLossGPU(
-        _ groundTruth: MetalBuffer<Float>,
+        _ groundTruth: FloatBuffer,
         batchSize: Int,
         nbNeurons: Int) throws -> Float
     {
@@ -233,9 +233,8 @@ public class BCE1D: LayerOutput1D
         command.dispatchThreads(batchSize)
         command.enqueue()
 
-        MetalKernel.get.download([loss])
         var loss: Float = 0.0
-        let lossPtr = self.loss.buffer
+        let lossPtr = self.loss.download()
         for i in 0..,
+        _ groundTruth: FloatBuffer,
         batchSize: Int,
         nbNeurons: Int) throws
     {
diff --git a/Sources/GrAIdient/Layer1D/BCESigmoid1D.swift b/Sources/GrAIdient/Layer1D/BCESigmoid1D.swift
index 237d3da3..79ff2e9d 100644
--- a/Sources/GrAIdient/Layer1D/BCESigmoid1D.swift
+++ b/Sources/GrAIdient/Layer1D/BCESigmoid1D.swift
@@ -230,7 +230,7 @@ public class BCESigmoid1D: LayerOutput1D
     /// - Returns: The loss value.
/// public func getLossGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbNeurons: Int) throws -> Float { @@ -256,9 +256,8 @@ public class BCESigmoid1D: LayerOutput1D command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0.., + _ groundTruth: FloatBuffer, batchSize: Int, nbNeurons: Int) throws { diff --git a/Sources/GrAIdient/Layer1D/Base/Layer1D.swift b/Sources/GrAIdient/Layer1D/Base/Layer1D.swift index 5e45c37f..ce2ab089 100644 --- a/Sources/GrAIdient/Layer1D/Base/Layer1D.swift +++ b/Sources/GrAIdient/Layer1D/Base/Layer1D.swift @@ -15,12 +15,12 @@ open class Layer1D: Layer /// Output buffer (result of the forward pass) used in the GPU execution context. /// Shape ~ (batch, nbNeurons). /// - public var outs: MetalPrivateBuffer! = nil + public var outs: FloatBuffer! = nil /// /// Gradient buffer (result of the backward pass) used in the GPU execution context. /// Shape ~ (batch, nbNeurons). /// - public var delta: MetalPrivateBuffer! = nil + public var delta: FloatBuffer! = nil /// Number of neurons. public let nbNeurons: Int @@ -138,8 +138,8 @@ open class Layer1D: Layer { if outs == nil { - outs = MetalPrivateBuffer( - batchSize * nbNeurons, deviceID: deviceID + outs = FloatBuffer( + nbElems: batchSize * nbNeurons, deviceID: deviceID ) } else if batchSize <= 0 || batchSize > outs.nbElems / nbNeurons @@ -159,8 +159,8 @@ open class Layer1D: Layer { if delta == nil { - delta = MetalPrivateBuffer( - batchSize * nbNeurons, deviceID: deviceID + delta = FloatBuffer( + nbElems: batchSize * nbNeurons, deviceID: deviceID ) } else if batchSize <= 0 || batchSize > delta.nbElems / nbNeurons @@ -194,9 +194,8 @@ open class Layer1D: Layer public func getOutsGPU(elem: Int) -> [T] { var outs = [T]() - MetalKernel.get.download([self.outs]) + let outsPtr = self.outs.download() - let outsPtr = self.outs.shared.buffer for depth in 0.., + _ data: FloatBuffer, batchSize: Int, nbNeurons: Int) throws { diff --git a/Sources/GrAIdient/Layer1D/Base/LayerOutput1D.swift b/Sources/GrAIdient/Layer1D/Base/LayerOutput1D.swift index 66ef7969..2479d066 100644 --- a/Sources/GrAIdient/Layer1D/Base/LayerOutput1D.swift +++ b/Sources/GrAIdient/Layer1D/Base/LayerOutput1D.swift @@ -15,13 +15,13 @@ open class LayerOutput1D: Layer1D /// Ground truth buffer in the GPU execution context. /// Shape ~ (batch, nbNeurons). /// - public internal(set) var groundTruth: MetalSharedBuffer! = nil + public internal(set) var groundTruth: FloatBuffer! = nil /// /// Loss buffer in the GPU execution context. /// Shape ~ (batch,). /// - public internal(set) var loss: MetalSharedBuffer! = nil + public internal(set) var loss: FloatBuffer! 
= nil private enum Keys: String, CodingKey { @@ -147,9 +147,10 @@ open class LayerOutput1D: Layer1D if self.groundTruth == nil { - self.groundTruth = MetalSharedBuffer( - batchSize * nbNeurons, - deviceID: deviceID + self.groundTruth = FloatBuffer( + nbElems: batchSize * nbNeurons, + deviceID: deviceID, + shared: true ) } else if batchSize <= 0 || @@ -158,7 +159,7 @@ open class LayerOutput1D: Layer1D throw LayerError.BatchSize } - let bufferPtr = self.groundTruth.buffer + var buffer = [Float](repeating: 0.0, count: batchSize * nbNeurons) for (i, dataI) in groundTruth.enumerated() { if dataI.count != nbNeurons @@ -167,10 +168,10 @@ open class LayerOutput1D: Layer1D } for (j, dataIJ) in dataI.enumerated() { - bufferPtr[j + i * nbNeurons] = Float(dataIJ) + buffer[j + i * nbNeurons] = Float(dataIJ) } } - MetalKernel.get.upload([self.groundTruth]) + self.groundTruth.initialize(array: &buffer) } /// @@ -184,7 +185,7 @@ open class LayerOutput1D: Layer1D /// - nbNeurons: Number of neurons. /// public func checkGroundTruthGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbNeurons: Int) throws { @@ -211,7 +212,9 @@ open class LayerOutput1D: Layer1D { if loss == nil { - loss = MetalSharedBuffer(batchSize, deviceID: deviceID) + loss = FloatBuffer( + nbElems: batchSize, deviceID: deviceID, shared: true + ) } else if batchSize > loss.nbElems { diff --git a/Sources/GrAIdient/Layer1D/Concat1D.swift b/Sources/GrAIdient/Layer1D/Concat1D.swift index f163a8d5..afa46c15 100644 --- a/Sources/GrAIdient/Layer1D/Concat1D.swift +++ b/Sources/GrAIdient/Layer1D/Concat1D.swift @@ -146,9 +146,10 @@ public class Concat1D: LayerMerge1D { try checkStateCPU(batchSize: batchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! Layer1D).outs]) + buffersPrev.append((_layersPrev[num] as! Layer1D).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -190,7 +191,7 @@ public class Concat1D: LayerMerge1D var curElem = 0 for num in 0..<_layersPrev.count { - let outsPrevPtr = (_layersPrev[num] as! Layer1D).outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = (_layersPrev[num] as! Layer1D).neurons let nbNeurons = neuronsPrev.nbElems diff --git a/Sources/GrAIdient/Layer1D/Constant1D.swift b/Sources/GrAIdient/Layer1D/Constant1D.swift index 0c5f4bae..8976a21f 100644 --- a/Sources/GrAIdient/Layer1D/Constant1D.swift +++ b/Sources/GrAIdient/Layer1D/Constant1D.swift @@ -24,7 +24,7 @@ public class Constant1D: Layer1D, LayerUpdate /// Buffer of gradients per sample for biases. /// Shape ~ (batch, nbNeurons). /// - var _wDeltaWeights: MetalPrivateBuffer! = nil + var _wDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. 
public var computeDeltaWeights: Bool = true @@ -64,12 +64,7 @@ public class Constant1D: Layer1D, LayerUpdate { return _weightsList } - - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - - return weightsTmp + return _wBuffers.w.download() } set { _weightsList = newValue @@ -258,19 +253,16 @@ public class Constant1D: Layer1D, LayerUpdate deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer if _weightsList.count != 0 { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: nbNeurons - ) + _wBuffers.w.initialize(array: &_weightsList) + } + else + { + _wBuffers.w.initialize() } - _weightsList = [] - MetalKernel.get.upload([_wBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil } @@ -287,7 +279,7 @@ public class Constant1D: Layer1D, LayerUpdate if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * nbNeurons, deviceID: deviceID ) } @@ -348,8 +340,7 @@ public class Constant1D: Layer1D, LayerUpdate neurons.get(depth)!.initGC(batchSize: batchSize, nbGC: newGC) } - MetalKernel.get.download([_wBuffers.w_p!]) - let weightsPtr = _wBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() for batch in 0..! = nil + var _wDeltaWeights: FloatBuffer! = nil /// /// Buffer of gradients per sample for biases. /// Shape ~ (batch, nbNeurons). /// - var _bDeltaWeights: MetalPrivateBuffer! = nil + var _bDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. public var computeDeltaWeights: Bool = true @@ -105,7 +105,7 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit } /// Output buffer of previous layer. - var outsPrev: MetalPrivateBuffer + var outsPrev: FloatBuffer { get { if let layerPrev = self.layerPrev as? Layer1D @@ -124,7 +124,7 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit } /// Gradient buffer of previous layer. - var deltaPrev: MetalPrivateBuffer? + var deltaPrev: FloatBuffer? { get { if let layerPrev = self.layerPrev as? 
Layer1D @@ -199,14 +199,10 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit return _weightsList } - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - + var weightsTmp = _wBuffers.w.download() if _updateBiases { - MetalKernel.get.download([_bBuffers.w_p!]) - weightsTmp += _bBuffers.w_p!.shared.array + weightsTmp += _bBuffers.w.download() } return weightsTmp } @@ -576,35 +572,24 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer - + _bBuffers.w.initialize() if _weightsList.count == 0 { - generateWeightsList(buffer: weightsPtr) + generateWeightsList(out: _wBuffers.w, deviceID: deviceID) } else { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: weightHeight * weightWidth - ) + _wBuffers.w.initialize(array: &_weightsList) if _updateBiases { - copyFloatArrayToBuffer( + _bBuffers.w.initialize( array: &_weightsList, - buffer: biasesPtr, - start: weightHeight * weightWidth, - nbElems: weightHeight + start: weightHeight * weightWidth ) } } - _weightsList = [] - - MetalKernel.get.upload([_wBuffers.w_p!, _bBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil _bDeltaWeights = nil } @@ -622,13 +607,13 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * nbNeurons * weightWidth, deviceID: deviceID ) if _updateBiases { - _bDeltaWeights = MetalPrivateBuffer( + _bDeltaWeights = FloatBuffer(nbElems: batchSize * nbNeurons, deviceID: deviceID ) } @@ -771,11 +756,8 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit neurons.get(depth)!.initGC(batchSize: batchSize, nbGC: newGC) } - MetalKernel.get.download([_wBuffers.w_p!, _bBuffers.w_p!]) - MetalKernel.get.download([outsPrev]) - - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() + let biasesPtr = _bBuffers.w.download() let neuronsPrev = self.neuronsPrev for batch in 0.. [IWeightArrays] { - var weights = [IWeightArrays]() + var weights = [WeightArrays]() weights.append(_wArrays) if _updateBiases { @@ -1248,8 +1230,7 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit } var deltaWeights = [T]() - MetalKernel.get.download([_wDeltaWeights]) - var deltaWeightsPtr = _wDeltaWeights.shared.buffer + var deltaWeightsPtr = _wDeltaWeights.download() let offsetStart = elem * nbNeurons * weightWidth for depth in 0.., IWeightArrays /// GPU buffers needed to update the inputs of a layer. class InputBuffers1D: InputBuffers, IWeightBuffers -{ +{ /// Inputs buffer: the buffer to be update. - var w: MetalBuffer + var w: FloatBuffer { get { return _layer.outs @@ -71,7 +71,7 @@ class InputBuffers1D: InputBuffers, IWeightBuffers } /// Gradients buffer. - var g: MetalBuffer + var g: FloatBuffer { get { return _layer.delta @@ -304,7 +304,7 @@ public class Input1D: LayerInput1D, LayerUpdate /// - nbNeurons: Number of neurons. 
/// public func setDataGPU( - _ data: MetalPrivateBuffer, + _ data: FloatBuffer, batchSize: Int, nbNeurons: Int) throws { diff --git a/Sources/GrAIdient/Layer1D/LinearError1D.swift b/Sources/GrAIdient/Layer1D/LinearError1D.swift index 6549eeea..3ce12e28 100644 --- a/Sources/GrAIdient/Layer1D/LinearError1D.swift +++ b/Sources/GrAIdient/Layer1D/LinearError1D.swift @@ -201,7 +201,7 @@ public class LinearError1D: LayerOutput1D /// - Returns: The loss value. /// public func getLossGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int) throws -> Float { try checkLossGPU(batchSize: batchSize) @@ -225,9 +225,8 @@ public class LinearError1D: LayerOutput1D command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0.., + _ groundTruth: FloatBuffer, batchSize: Int, nbNeurons: Int) throws -> Float { @@ -229,9 +229,8 @@ public class MSE1D: LayerOutput1D command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0.., + _ groundTruth: FloatBuffer, batchSize: Int, nbNeurons: Int) throws { diff --git a/Sources/GrAIdient/Layer1D/Sum1D.swift b/Sources/GrAIdient/Layer1D/Sum1D.swift index 685b8416..01c66d44 100644 --- a/Sources/GrAIdient/Layer1D/Sum1D.swift +++ b/Sources/GrAIdient/Layer1D/Sum1D.swift @@ -155,9 +155,10 @@ public class Sum1D: LayerMerge1D { try checkStateCPU(batchSize: batchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! Layer1D).outs]) + buffersPrev.append((_layersPrev[num] as! Layer1D).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -197,8 +198,7 @@ public class Sum1D: LayerMerge1D var sum = 0.0 for num in 0..<_layersPrev.count { - let outsPrevPtr = - (_layersPrev[num] as! Layer1D).outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = (_layersPrev[num] as! Layer1D).neurons diff --git a/Sources/GrAIdient/Layer2D/Activation2D.swift b/Sources/GrAIdient/Layer2D/Activation2D.swift index fb57db0c..8b210d42 100644 --- a/Sources/GrAIdient/Layer2D/Activation2D.swift +++ b/Sources/GrAIdient/Layer2D/Activation2D.swift @@ -16,7 +16,7 @@ public class Activation2D: Layer2D /// used in the GPU execution context. /// Shape ~ (batch, nbChannels, height, width). /// - var _tmp: MetalPrivateBuffer! = nil + var _tmp: FloatBuffer! = nil /// Get coefficient (depending on activation function) to apply during the weights initialization. public var coeffInitWeights: Float diff --git a/Sources/GrAIdient/Layer2D/AdaIN.swift b/Sources/GrAIdient/Layer2D/AdaIN.swift index 2fd50d6c..c1f6beb6 100644 --- a/Sources/GrAIdient/Layer2D/AdaIN.swift +++ b/Sources/GrAIdient/Layer2D/AdaIN.swift @@ -362,10 +362,9 @@ public class AdaIN: LayerMerge2D let layerFirst = _layersPrev.first as! Layer2D let layerLast = _layersPrev.last as! Layer1D - MetalKernel.get.download([layerFirst.outs, layerLast.outs]) - let bufferOuts = layerFirst.outs.shared.buffer - let bufferStyles = layerLast.outs.shared.buffer + let bufferOuts = layerFirst.outs.download() + let bufferStyles = layerLast.outs.download() let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -663,7 +662,7 @@ public class AdaIN: LayerMerge2D /// - Returns: The outputs. 
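// The loss read-back follows one pattern across the output layers above: the
// kernel writes one partial loss per batch element, and the host reduces the
// downloaded [Float] on the CPU.
command.dispatchThreads(batchSize)
command.enqueue()

var loss: Float = 0.0
let lossPtr = self.loss.download()            // shape ~ (batch,)
for i in 0..<batchSize
{
    loss += lossPtr[i]
}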
/// func getOutsPrev( - buffer: UnsafeMutableBufferPointer, + buffer: [Float], depth: Int, batch: Int) -> [Double] { @@ -692,7 +691,7 @@ public class AdaIN: LayerMerge2D /// - Returns: The output. /// func getOutStyle( - buffer: UnsafeMutableBufferPointer, + buffer: [Float], depth: Int, batch: Int) -> Double { diff --git a/Sources/GrAIdient/Layer2D/BCE2D.swift b/Sources/GrAIdient/Layer2D/BCE2D.swift index 8b2b8010..cfcd5bc6 100644 --- a/Sources/GrAIdient/Layer2D/BCE2D.swift +++ b/Sources/GrAIdient/Layer2D/BCE2D.swift @@ -272,7 +272,7 @@ public class BCE2D: LayerOutput2D /// - Returns: The loss value. /// public func getLossGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws -> Float { @@ -300,9 +300,8 @@ public class BCE2D: LayerOutput2D command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0.., + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { diff --git a/Sources/GrAIdient/Layer2D/BCESigmoid2D.swift b/Sources/GrAIdient/Layer2D/BCESigmoid2D.swift index d1104542..6c5396c0 100644 --- a/Sources/GrAIdient/Layer2D/BCESigmoid2D.swift +++ b/Sources/GrAIdient/Layer2D/BCESigmoid2D.swift @@ -315,7 +315,7 @@ public class BCESigmoid2D: LayerOutput2D /// - Returns: The loss value. /// public func getLossGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws -> Float { @@ -343,9 +343,8 @@ public class BCESigmoid2D: LayerOutput2D command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0.., + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { diff --git a/Sources/GrAIdient/Layer2D/BN2D.swift b/Sources/GrAIdient/Layer2D/BN2D.swift index f154a2c9..5847ccb7 100644 --- a/Sources/GrAIdient/Layer2D/BN2D.swift +++ b/Sources/GrAIdient/Layer2D/BN2D.swift @@ -533,8 +533,7 @@ public class BN2D: Activation2D, LayerUpdate, LayerWithActivation }}} }} - MetalKernel.get.download([layerPrev.outs]) - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = layerPrev.outs.download() // Prepare GC for norm weights: Ɣ and β. for batch in 0.. [IWeightArrays] { - var weights = [IWeightArrays]() + var weights = [WeightArrays]() if let norm = self.norm { weights += norm.collectWeights() diff --git a/Sources/GrAIdient/Layer2D/Base/Layer2D.swift b/Sources/GrAIdient/Layer2D/Base/Layer2D.swift index fc95d9a3..e4af2a0b 100644 --- a/Sources/GrAIdient/Layer2D/Base/Layer2D.swift +++ b/Sources/GrAIdient/Layer2D/Base/Layer2D.swift @@ -15,12 +15,12 @@ open class Layer2D: Layer /// Output buffer (result of the forward pass) used in the GPU execution context. /// Shape ~ (batch, nbChannels, height, width). /// - public var outs: MetalPrivateBuffer! = nil + public var outs: FloatBuffer! = nil /// /// Gradient buffer (result of the backward pass) used in the GPU execution context. /// Shape ~ (batch, nbChannels, height, width). /// - public var delta: MetalPrivateBuffer! = nil + public var delta: FloatBuffer! = nil /// Number of channels. 
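// Helper signatures change in step: where AdaIN's getOutsPrev and getOutStyle
// used to take an UnsafeMutableBufferPointer<Float> into a shared buffer,
// they now take the downloaded [Float] directly. A sketch of the call side,
// assuming it mirrors the hunk above:
let bufferOuts = layerFirst.outs.download()
let bufferStyles = layerLast.outs.download()
let outsPrev = getOutsPrev(buffer: bufferOuts, depth: depth, batch: batch)
let outStyle = getOutStyle(buffer: bufferStyles, depth: depth, batch: batch)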
public let nbChannels: Int @@ -192,8 +192,9 @@ open class Layer2D: Layer { if outs == nil { - outs = MetalPrivateBuffer( - batchSize * nbChannels * width * height, deviceID: deviceID + outs = FloatBuffer( + nbElems: batchSize * nbChannels * width * height, + deviceID: deviceID ) } else if batchSize <= 0 || @@ -214,8 +215,9 @@ open class Layer2D: Layer { if delta == nil { - delta = MetalPrivateBuffer( - batchSize * nbChannels * width * height, deviceID: deviceID + delta = FloatBuffer( + nbElems: batchSize * nbChannels * width * height, + deviceID: deviceID ) } else if batchSize <= 0 || @@ -251,9 +253,8 @@ open class Layer2D: Layer public func getOutsGPU(elem: Int) -> [T] { var outs = [T]() - MetalKernel.get.download([self.outs]) + let outsPtr = self.outs.download() - let outsPtr = self.outs.shared.buffer for depth in 0.., + _ data: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { diff --git a/Sources/GrAIdient/Layer2D/Base/LayerOutput2D.swift b/Sources/GrAIdient/Layer2D/Base/LayerOutput2D.swift index c6d9fbd9..fcd11e8e 100644 --- a/Sources/GrAIdient/Layer2D/Base/LayerOutput2D.swift +++ b/Sources/GrAIdient/Layer2D/Base/LayerOutput2D.swift @@ -15,13 +15,13 @@ open class LayerOutput2D: Layer2D /// Ground truth buffer in the GPU execution context. /// Shape ~ (batch, nbChannels, height, width). /// - public internal(set) var groundTruth: MetalSharedBuffer! = nil + public internal(set) var groundTruth: FloatBuffer! = nil /// /// Loss buffer in the GPU execution context. /// Shape ~ (batch,). /// - public internal(set) var loss: MetalSharedBuffer! = nil + public internal(set) var loss: FloatBuffer! = nil private enum Keys: String, CodingKey { @@ -157,9 +157,10 @@ open class LayerOutput2D: Layer2D if self.groundTruth == nil { - self.groundTruth = MetalSharedBuffer( - batchSize * nbChannels * height * width, - deviceID: deviceID + self.groundTruth = FloatBuffer( + nbElems: batchSize * nbChannels * height * width, + deviceID: deviceID, + shared: true ) } else if batchSize <= 0 || @@ -168,7 +169,10 @@ open class LayerOutput2D: Layer2D throw LayerError.BatchSize } - let bufferPtr = self.groundTruth.buffer + var buffer = [Float]( + repeating: 0.0, count: batchSize * nbChannels * height * width + ) + switch format { case .RGB: @@ -184,7 +188,7 @@ open class LayerOutput2D: Layer2D let offsetSet = j + (offsetStart + i) * width let gt = groundTruth[nbChannels * offsetGet + depth] - bufferPtr[offsetSet] = Float(gt) + buffer[offsetSet] = Float(gt) }} }} case .Neuron: @@ -199,11 +203,11 @@ open class LayerOutput2D: Layer2D let offset = j + (offsetStart + i) * width let gt = groundTruth[offset] - bufferPtr[offset] = Float(gt) + buffer[offset] = Float(gt) }} }} } - MetalKernel.get.upload([self.groundTruth]) + self.groundTruth.initialize(array: &buffer) } /// @@ -219,7 +223,7 @@ open class LayerOutput2D: Layer2D /// - width: Width of each channel. 
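// Ground truth is now staged in a plain host array and committed with one
// initialize(array:) call, instead of being written through a shared buffer's
// raw pointer and uploaded afterwards (from the LayerOutput2D hunk above; the
// .RGB / .Neuron layout logic is unchanged):
var buffer = [Float](
    repeating: 0.0, count: batchSize * nbChannels * height * width
)
// ... fill `buffer` according to the data format ...
self.groundTruth.initialize(array: &buffer)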
/// public func checkGroundTruthGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { @@ -248,7 +252,9 @@ open class LayerOutput2D: Layer2D { if loss == nil { - loss = MetalSharedBuffer(batchSize, deviceID: deviceID) + loss = FloatBuffer( + nbElems: batchSize, deviceID: deviceID, shared: true + ) } else if batchSize <= 0 || batchSize > loss.nbElems { diff --git a/Sources/GrAIdient/Layer2D/Concat2D.swift b/Sources/GrAIdient/Layer2D/Concat2D.swift index 4a9a0e6c..17fdfd1a 100644 --- a/Sources/GrAIdient/Layer2D/Concat2D.swift +++ b/Sources/GrAIdient/Layer2D/Concat2D.swift @@ -168,9 +168,10 @@ public class Concat2D: LayerMerge2D { try checkStateCPU(batchSize: batchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! Layer2D).outs]) + buffersPrev.append((_layersPrev[num] as! Layer2D).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -221,7 +222,7 @@ public class Concat2D: LayerMerge2D var curElem = 0 for num in 0..<_layersPrev.count { - let outsPrevPtr = (_layersPrev[num] as! Layer2D).outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = (_layersPrev[num] as! Layer2D).neurons let nbChannels = neuronsPrev.count diff --git a/Sources/GrAIdient/Layer2D/Constant2D.swift b/Sources/GrAIdient/Layer2D/Constant2D.swift index 0b65cf86..96d80aee 100644 --- a/Sources/GrAIdient/Layer2D/Constant2D.swift +++ b/Sources/GrAIdient/Layer2D/Constant2D.swift @@ -24,7 +24,7 @@ public class Constant2D: Layer2D, LayerResize, LayerUpdate /// Buffer of gradients per sample for biases. /// Shape ~ (batch, nbChannels). /// - var _wDeltaWeights: MetalPrivateBuffer! = nil + var _wDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. public var computeDeltaWeights: Bool = true @@ -64,12 +64,7 @@ public class Constant2D: Layer2D, LayerResize, LayerUpdate { return _weightsList } - - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - - return weightsTmp + return _wBuffers.w.download() } set { _weightsList = newValue @@ -315,19 +310,16 @@ public class Constant2D: Layer2D, LayerResize, LayerUpdate deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer if _weightsList.count != 0 { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: nbChannels - ) + _wBuffers.w.initialize(array: &_weightsList) + } + else + { + _wBuffers.w.initialize() } - _weightsList = [] - MetalKernel.get.upload([_wBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil } @@ -344,7 +336,7 @@ public class Constant2D: Layer2D, LayerResize, LayerUpdate if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * nbChannels, deviceID: deviceID ) } @@ -411,8 +403,7 @@ public class Constant2D: Layer2D, LayerResize, LayerUpdate neurons[depth].get(i, j)!.initGC(batchSize: batchSize, nbGC: newGC) }}} - MetalKernel.get.download([_wBuffers.w_p!]) - let weightsPtr = _wBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() for batch in 0..! = nil + var _wDeltaWeights: FloatBuffer! = nil /// /// Buffer of gradients per sample for biases. /// Shape ~ (batch, nbChannels). /// - var _bDeltaWeights: MetalPrivateBuffer! = nil + var _bDeltaWeights: FloatBuffer! = nil /// Number of weight kernels. 
public let nbWeights: Int @@ -184,14 +184,10 @@ public class Convolution2D: BN2D, LayerWeightInit return _weightsList } - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - + var weightsTmp = _wBuffers.w.download() if _updateBiases { - MetalKernel.get.download([_bBuffers.w_p!]) - weightsTmp += _bBuffers.w_p!.shared.array + weightsTmp += _bBuffers.w.download() } return weightsTmp } @@ -782,35 +778,24 @@ public class Convolution2D: BN2D, LayerWeightInit deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer - + _bBuffers.w.initialize() if _weightsList.count == 0 { - generateWeightsList(buffer: weightsPtr) + generateWeightsList(out: _wBuffers.w, deviceID: deviceID) } else { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: nbWeights * weightHeight * weightWidth - ) + _wBuffers.w.initialize(array: &_weightsList) if _updateBiases { - copyFloatArrayToBuffer( + _bBuffers.w.initialize( array: &_weightsList, - buffer: biasesPtr, - start: nbWeights * weightHeight * weightWidth, - nbElems: nbChannels + start: nbWeights * weightHeight * weightWidth ) } } - _weightsList = [] - - MetalKernel.get.upload([_wBuffers.w_p!, _bBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil _bDeltaWeights = nil } @@ -828,14 +813,14 @@ public class Convolution2D: BN2D, LayerWeightInit if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * nbWeights * weightWidth * weightHeight, deviceID: deviceID ) if _updateBiases { - _bDeltaWeights = MetalPrivateBuffer( + _bDeltaWeights = FloatBuffer(nbElems: batchSize * nbChannels, deviceID: deviceID ) } @@ -1071,11 +1056,8 @@ public class Convolution2D: BN2D, LayerWeightInit }} } - MetalKernel.get.download([_wBuffers.w_p!, _bBuffers.w_p!]) - MetalKernel.get.download([layerPrev.outs]) - - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() + let biasesPtr = _bBuffers.w.download() let neuronsPrev = layerPrev.neurons let widthPrev = layerPrev.width @@ -1115,7 +1097,7 @@ public class Convolution2D: BN2D, LayerWeightInit }} }}} - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = layerPrev.outs.download() for batch in 0.. [IWeightArrays] { - var weights = [IWeightArrays]() + var weights = [WeightArrays]() weights += _wArrays if _updateBiases { @@ -1826,8 +1808,7 @@ public class Convolution2D: BN2D, LayerWeightInit } var deltaWeights = [T]() - MetalKernel.get.download([_wDeltaWeights]) - var deltaWeightsPtr = _wDeltaWeights.shared.buffer + var deltaWeightsPtr = _wDeltaWeights.download() let nbChannelsPrev = (self.layerPrev as! Layer2D).nbChannels let offsetStartGrid = @@ -1853,8 +1834,7 @@ public class Convolution2D: BN2D, LayerWeightInit if _updateBiases { - MetalKernel.get.download([_bDeltaWeights]) - deltaWeightsPtr = _bDeltaWeights.shared.buffer + deltaWeightsPtr = _bDeltaWeights.download() for depth in 0.., IWeightArrays class InputBuffers2D: InputBuffers, IWeightBuffers { /// Inputs buffer: the buffer to be update. - var w: MetalBuffer + var w: FloatBuffer { get { return _layer.outs @@ -90,7 +90,7 @@ class InputBuffers2D: InputBuffers, IWeightBuffers } /// Gradients buffer. 
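// Sketch of the new weight-initialization flow for layers holding weights and
// biases, assembled from the Convolution2D hunk above. One assumption here:
// initialize() with no arguments resets the bias buffer to its default (zero)
// contents.
_bBuffers.w.initialize()
if _weightsList.count == 0
{
    // No serialized weights: generate fresh ones straight into the buffer.
    generateWeightsList(out: _wBuffers.w, deviceID: deviceID)
}
else
{
    _wBuffers.w.initialize(array: &_weightsList)
    if _updateBiases
    {
        // Biases are serialized right after the weights.
        _bBuffers.w.initialize(
            array: &_weightsList,
            start: nbWeights * weightHeight * weightWidth
        )
    }
}
_weightsList = []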
- var g: MetalBuffer + var g: FloatBuffer { get { return _layer.delta @@ -397,7 +397,7 @@ public class Input2D: LayerInput2D, LayerResize, LayerUpdate /// - width: Width of each channel. /// public func setDataGPU( - _ data: MetalPrivateBuffer, + _ data: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { diff --git a/Sources/GrAIdient/Layer2D/InstanceNorm2D.swift b/Sources/GrAIdient/Layer2D/InstanceNorm2D.swift index 17ccbc4e..1585cdb6 100644 --- a/Sources/GrAIdient/Layer2D/InstanceNorm2D.swift +++ b/Sources/GrAIdient/Layer2D/InstanceNorm2D.swift @@ -457,8 +457,7 @@ public class InstanceNorm2D: Activation2D, LayerUpdate, LayerWithActivation }}} }} - MetalKernel.get.download([layerPrev.outs]) - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = layerPrev.outs.download() // Prepare GC for norm weights: Ɣ and β. for batch in 0.. [IWeightArrays] { - var weights = [IWeightArrays]() + var weights = [WeightArrays]() if let norm = self.norm { weights += norm.collectWeights() diff --git a/Sources/GrAIdient/Layer2D/MSE2D.swift b/Sources/GrAIdient/Layer2D/MSE2D.swift index 1cdf404f..75775063 100644 --- a/Sources/GrAIdient/Layer2D/MSE2D.swift +++ b/Sources/GrAIdient/Layer2D/MSE2D.swift @@ -268,7 +268,7 @@ public class MSE2D: LayerOutput2D /// - Returns: The loss value. /// public func getLossGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws -> Float { @@ -296,9 +296,8 @@ public class MSE2D: LayerOutput2D command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0.., + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { diff --git a/Sources/GrAIdient/Layer2D/Multiply2D.swift b/Sources/GrAIdient/Layer2D/Multiply2D.swift index d5d879ec..677bf228 100644 --- a/Sources/GrAIdient/Layer2D/Multiply2D.swift +++ b/Sources/GrAIdient/Layer2D/Multiply2D.swift @@ -14,10 +14,15 @@ public class Multiply2D: LayerMerge2D { /// - /// List of output buffers. + /// List of output buffers for CPU usage. /// Shape ~ (batch, nbChannels, height, width). /// - var _otherOuts: [MetalBuffer] = [] + var _otherOuts1: [[Double]] = [] + /// + /// List of output buffers for GPU usage. + /// Shape ~ (batch, nbChannels, height, width). + /// + var _otherOuts2: [FloatBuffer] = [] /// /// Create a layer with a 2D shape neural structure. 
@@ -97,7 +102,7 @@ public class Multiply2D: LayerMerge2D public override func resetKernelCPU() { super.resetKernelCPU() - _otherOuts = [] + _otherOuts1 = [] } /// @@ -108,7 +113,7 @@ public class Multiply2D: LayerMerge2D public override func resetKernelGPU() { super.resetKernelGPU() - _otherOuts = [] + _otherOuts2 = [] } /// @@ -120,15 +125,14 @@ public class Multiply2D: LayerMerge2D { try super.checkStateCPU(batchSize: batchSize) - if _otherOuts.count == 0 + if _otherOuts1.count == 0 { for _ in 0..<_layersPrev.count { - let buffer = MetalSharedBuffer( - batchSize * nbChannels * height * width, - deviceID: deviceID - ) - _otherOuts.append(buffer) + _otherOuts1.append([Double]( + repeating: 0.0, + count: batchSize * nbChannels * height * width + )) } } } @@ -142,15 +146,15 @@ public class Multiply2D: LayerMerge2D { try super.checkStateForwardGPU(batchSize: batchSize) - if _otherOuts.count == 0 + if _otherOuts2.count == 0 { for _ in 0..<_layersPrev.count { - let buffer = MetalPrivateBuffer( + let buffer = FloatBuffer(nbElems: batchSize * nbChannels * height * width, deviceID: deviceID ) - _otherOuts.append(buffer) + _otherOuts2.append(buffer) } } } @@ -248,9 +252,10 @@ public class Multiply2D: LayerMerge2D { try checkStateCPU(batchSize: batchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! Layer2D).outs]) + buffersPrev.append((_layersPrev[num] as! Layer2D).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -305,8 +310,7 @@ public class Multiply2D: LayerMerge2D var mult = 1.0 for num in 0..<_layersPrev.count { - let outsPrevPtr = - (_layersPrev[num] as! Layer2D).outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = (_layersPrev[num] as! Layer2D).neurons @@ -363,8 +367,6 @@ public class Multiply2D: LayerMerge2D for num1 in 0..<_layersPrev.count { - let buffer = (_otherOuts[num1] as! MetalSharedBuffer).buffer - mult = 1.0 for num2 in 0..<_layersPrev.count { if num2 != num1 @@ -373,8 +375,7 @@ public class Multiply2D: LayerMerge2D (_layersPrev[num2] as! Layer2D).neurons mult *= neuronsPrev[depth].get(i, j)!.v[elem].out }} - - buffer[offset] = Float(mult) + _otherOuts1[num1][offset] = mult } }} }} @@ -441,7 +442,7 @@ public class Multiply2D: LayerMerge2D (_layersPrev[num2] as! Layer2D).outs.metal, atIndex: 0 ) command.setBytes(pNbElems, atIndex: 1) - command.setBuffer(_otherOuts[num1].metal, atIndex: 2) + command.setBuffer(_otherOuts2[num1].metal, atIndex: 2) command.dispatchThreads(nbElems) command.enqueue() @@ -465,7 +466,7 @@ public class Multiply2D: LayerMerge2D } let neuronsPrev = (_layersPrev[num] as! Layer2D).neurons - let buffer = (_otherOuts[num] as! MetalSharedBuffer).buffer + let buffer = _otherOuts1[num] for elem in 0..! = nil + private var _squaredNorm: FloatBuffer! = nil /// /// Temporary delta buffer used in the GPU execution context. /// Shape ~ (batch, nbThreadgroups). /// - private var _deltaTmp: MetalPrivateBuffer! = nil + private var _deltaTmp: FloatBuffer! = nil /// Number of thread groups in the GPU execution context. 
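// Multiply2D shows why some layers now split their temporaries: the CPU path
// keeps [[Double]] host arrays (_otherOuts1) while the GPU path keeps
// [FloatBuffer] (_otherOuts2), where a single [MetalBuffer<Float>] previously
// served both contexts. The CPU path then assigns plain array elements
// instead of writing through a shared buffer's pointer:
_otherOuts1[num1][offset] = mult                           // CPU context
command.setBuffer(_otherOuts2[num1].metal, atIndex: 2)     // GPU context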
var nbThreadgroups: Int @@ -404,7 +404,7 @@ public class Normalize122D: Layer2D { if _squaredNorm == nil { - _squaredNorm = MetalPrivateBuffer( + _squaredNorm = FloatBuffer(nbElems: batchSize * nbThreadgroups, deviceID: deviceID ) } @@ -422,7 +422,7 @@ public class Normalize122D: Layer2D { if _deltaTmp == nil { - _deltaTmp = MetalPrivateBuffer( + _deltaTmp = FloatBuffer(nbElems: batchSize * nbThreadgroups, deviceID: deviceID ) } diff --git a/Sources/GrAIdient/Layer2D/SimilarityBatchError2D.swift b/Sources/GrAIdient/Layer2D/SimilarityBatchError2D.swift index f341e429..a93b2c9e 100644 --- a/Sources/GrAIdient/Layer2D/SimilarityBatchError2D.swift +++ b/Sources/GrAIdient/Layer2D/SimilarityBatchError2D.swift @@ -126,7 +126,7 @@ public class SimilarityBatchError2D: LayerOutput2D /// - width: Width of each channel. /// public override func checkGroundTruthGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { @@ -144,9 +144,10 @@ public class SimilarityBatchError2D: LayerOutput2D { if loss == nil { - loss = MetalSharedBuffer( - batchSize * batchSize, - deviceID: deviceID + loss = FloatBuffer( + nbElems: batchSize * batchSize, + deviceID: deviceID, + shared: true ) } else if batchSize <= 0 || batchSize * batchSize > loss.nbElems @@ -259,9 +260,8 @@ public class SimilarityBatchError2D: LayerOutput2D command.dispatchThreads(width: batchSize, height: batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for elem1 in 0..! = nil + public internal(set) var loss: FloatBuffer! = nil /// Batch size sum in the previous layers. public var mergedBatchSize: Int @@ -151,9 +151,10 @@ public class SimilarityError2D: LayerMerge2D { if loss == nil { - loss = MetalSharedBuffer( - batchSize * batchSize, - deviceID: deviceID + loss = FloatBuffer( + nbElems: batchSize * batchSize, + deviceID: deviceID, + shared: true ) } else if batchSize <= 0 || batchSize * batchSize > loss.nbElems @@ -255,9 +256,10 @@ public class SimilarityError2D: LayerMerge2D { try checkStateCPU(batchSize: mergedBatchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! Layer2D).outs]) + buffersPrev.append((_layersPrev[num] as! Layer2D).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -300,7 +302,7 @@ public class SimilarityError2D: LayerMerge2D for num in 0..<_layersPrev.count { let batchSize = _layersPrev[num].batchSize - let outsPrevPtr = (_layersPrev[num] as! Layer2D).outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = (_layersPrev[num] as! Layer2D).neurons for batch in 0..! = nil + var _wDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. 
public var computeDeltaWeights: Bool = true @@ -103,12 +103,7 @@ public class VQ2D: LayerOutput2D, LayerWeightInit { return _weightsList } - - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - - return weightsTmp + return _wBuffers.w.download() } set { _weightsList = newValue @@ -314,23 +309,16 @@ public class VQ2D: LayerOutput2D, LayerWeightInit deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer if _weightsList.count == 0 { - generateWeightsList(buffer: weightsPtr) + generateWeightsList(out: _wBuffers.w, deviceID: deviceID) } else { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: K * nbChannels - ) + _wBuffers.w.initialize(array: &_weightsList) } - _weightsList = [] - MetalKernel.get.upload([_wBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil } @@ -365,7 +353,7 @@ public class VQ2D: LayerOutput2D, LayerWeightInit if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * K * nbChannels, deviceID: deviceID ) } @@ -434,7 +422,7 @@ public class VQ2D: LayerOutput2D, LayerWeightInit /// - width: Width of each channel. /// public override func checkGroundTruthGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { @@ -859,9 +847,8 @@ public class VQ2D: LayerOutput2D, LayerWeightInit command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0..! = nil + private var _camMax: FloatBuffer! = nil /// Number of thread groups in the GPU execution context. var nbThreadgroups: Int @@ -1169,7 +1156,7 @@ public class VQGrad2D: VQ2D if _camMax == nil { - _camMax = MetalPrivateBuffer( + _camMax = FloatBuffer(nbElems: batchSize * nbThreadgroups, deviceID: deviceID ) diff --git a/Sources/GrAIdient/LayerSeq/ActivationSeq.swift b/Sources/GrAIdient/LayerSeq/ActivationSeq.swift index 484431cc..39521636 100644 --- a/Sources/GrAIdient/LayerSeq/ActivationSeq.swift +++ b/Sources/GrAIdient/LayerSeq/ActivationSeq.swift @@ -16,7 +16,7 @@ public class ActivationSeq: LayerSeq /// used in the GPU execution context. /// Shape ~ (batch, nbNeurons). /// - var _tmp: MetalPrivateBuffer! = nil + var _tmp: FloatBuffer! = nil /// Get coefficient (depending on activation function) to apply during the weights initialization. public var coeffInitWeights: Float diff --git a/Sources/GrAIdient/LayerSeq/Base/LayerSeq.swift b/Sources/GrAIdient/LayerSeq/Base/LayerSeq.swift index 960ae791..857057f1 100644 --- a/Sources/GrAIdient/LayerSeq/Base/LayerSeq.swift +++ b/Sources/GrAIdient/LayerSeq/Base/LayerSeq.swift @@ -15,12 +15,12 @@ open class LayerSeq: Layer /// Output buffer (result of the forward pass) used in the GPU execution context. /// Shape ~ (batch, seq, nbNeurons). /// - public var outs: MetalPrivateBuffer! = nil + public var outs: FloatBuffer! = nil /// /// Gradient buffer (result of the backward pass) used in the GPU execution context. /// Shape ~ (batch, seq, nbNeurons). /// - public var delta: MetalPrivateBuffer! = nil + public var delta: FloatBuffer! = nil /// Length of the sequence. 
public let sequence: Int @@ -148,8 +148,9 @@ open class LayerSeq: Layer { if outs == nil { - outs = MetalPrivateBuffer( - batchSize * sequence * nbNeurons, deviceID: deviceID + outs = FloatBuffer( + nbElems: batchSize * sequence * nbNeurons, + deviceID: deviceID ) } else if batchSize <= 0 || batchSize > outs.nbElems / nbNeurons @@ -169,8 +170,9 @@ open class LayerSeq: Layer { if delta == nil { - delta = MetalPrivateBuffer( - batchSize * sequence * nbNeurons, deviceID: deviceID + delta = FloatBuffer( + nbElems: batchSize * sequence * nbNeurons, + deviceID: deviceID ) } else if batchSize <= 0 || diff --git a/Sources/GrAIdient/LayerSeq/ConcatSeq.swift b/Sources/GrAIdient/LayerSeq/ConcatSeq.swift index b205a439..059ad9ef 100644 --- a/Sources/GrAIdient/LayerSeq/ConcatSeq.swift +++ b/Sources/GrAIdient/LayerSeq/ConcatSeq.swift @@ -164,9 +164,10 @@ public class Concat1Seq: LayerMergeSeq { try checkStateCPU(batchSize: batchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! LayerSeq).outs]) + buffersPrev.append((_layersPrev[num] as! LayerSeq).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -213,7 +214,7 @@ public class Concat1Seq: LayerMergeSeq for num in 0..<_layersPrev.count { let layerPrev = _layersPrev[num] as! LayerSeq - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = layerPrev.neurons! let sequence = layerPrev.sequence @@ -595,9 +596,10 @@ public class Concat2Seq: LayerMergeSeq { try checkStateCPU(batchSize: batchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! LayerSeq).outs]) + buffersPrev.append((_layersPrev[num] as! LayerSeq).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -644,7 +646,7 @@ public class Concat2Seq: LayerMergeSeq for num in 0..<_layersPrev.count { let layerPrev = _layersPrev[num] as! LayerSeq - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = layerPrev.neurons! let nbNeurons = layerPrev.nbNeurons diff --git a/Sources/GrAIdient/LayerSeq/ConstantSeq.swift b/Sources/GrAIdient/LayerSeq/ConstantSeq.swift index 3156765e..f8796ecb 100644 --- a/Sources/GrAIdient/LayerSeq/ConstantSeq.swift +++ b/Sources/GrAIdient/LayerSeq/ConstantSeq.swift @@ -63,12 +63,7 @@ public class Constant12Seq: LayerSeq, LayerUpdate { return _weightsList } - - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - - return weightsTmp + return _wBuffers.w.download() } set { _weightsList = newValue @@ -261,19 +256,15 @@ public class Constant12Seq: LayerSeq, LayerUpdate deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer if _weightsList.count != 0 { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: sequence * nbNeurons - ) + _wBuffers.w.initialize(array: &_weightsList) + } + else + { + _wBuffers.w.initialize() } _weightsList = [] - - MetalKernel.get.upload([_wBuffers.w_p!]) } /// @@ -339,8 +330,7 @@ public class Constant12Seq: LayerSeq, LayerUpdate ) }} - MetalKernel.get.download([_wBuffers.w_p!]) - let weightsPtr = _wBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() for batch in 0..! = nil + var _wDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. 
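// Merge layers share one CPU-path pattern (as in the Concat and Sum hunks):
// previous layers' GPU outputs are snapshotted once into [Float] arrays up
// front, replacing per-layer downloads into shared buffers.
var buffersPrev = [[Float]]()
for num in 0..<_layersPrev.count
{
    buffersPrev.append((_layersPrev[num] as! LayerSeq).outs.download())
}
// ... later, when accumulating the merged value, per previous layer:
let outsPrevPtr = buffersPrev[num]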
public var computeDeltaWeights: Bool = true @@ -558,12 +548,7 @@ public class Constant2Seq: LayerSeq, LayerUpdate { return _weightsList } - - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - - return weightsTmp + return _wBuffers.w.download() } set { _weightsList = newValue @@ -755,19 +740,16 @@ public class Constant2Seq: LayerSeq, LayerUpdate deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer if _weightsList.count != 0 { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: nbNeurons - ) + _wBuffers.w.initialize(array: &_weightsList) + } + else + { + _wBuffers.w.initialize() } - _weightsList = [] - MetalKernel.get.upload([_wBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil } @@ -784,7 +766,7 @@ public class Constant2Seq: LayerSeq, LayerUpdate if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * sequence * nbNeurons, deviceID: deviceID ) } @@ -852,8 +834,7 @@ public class Constant2Seq: LayerSeq, LayerUpdate ) }} - MetalKernel.get.download([_wBuffers.w_p!]) - let weightsPtr = _wBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() for batch in 0..! = nil + var _wDeltaWeights: FloatBuffer! = nil /// /// Buffer of gradients per sample for biases. /// Shape ~ (batch, nbNeurons). /// - var _bDeltaWeights: MetalPrivateBuffer! = nil + var _bDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. public var computeDeltaWeights: Bool = true @@ -106,14 +106,10 @@ public class FullyConnectedPatch: ActivationSeq, return _weightsList } - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - + var weightsTmp = _wBuffers.w.download() if _updateBiases { - MetalKernel.get.download([_bBuffers.w_p!]) - weightsTmp += _bBuffers.w_p!.shared.array + weightsTmp += _bBuffers.w.download() } return weightsTmp } @@ -467,34 +463,24 @@ public class FullyConnectedPatch: ActivationSeq, deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer - + _bBuffers.w.initialize() if _weightsList.count == 0 { - generateWeightsList(buffer: weightsPtr) + generateWeightsList(out: _wBuffers.w, deviceID: deviceID) } else { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: weightHeight * weightWidth - ) + _wBuffers.w.initialize(array: &_weightsList) if _updateBiases { - copyFloatArrayToBuffer( + _bBuffers.w.initialize( array: &_weightsList, - buffer: biasesPtr, - start: weightHeight * weightWidth, - nbElems: weightHeight + start: weightHeight * weightWidth ) } } - _weightsList = [] - MetalKernel.get.upload([_wBuffers.w_p!, _bBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil _bDeltaWeights = nil @@ -513,14 +499,14 @@ public class FullyConnectedPatch: ActivationSeq, if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * sequence * nbNeurons * weightWidth, deviceID: deviceID ) if _updateBiases { - _bDeltaWeights = MetalPrivateBuffer( + _bDeltaWeights = FloatBuffer(nbElems: batchSize * sequence * nbNeurons, deviceID: deviceID ) } @@ -715,11 +701,8 @@ public class FullyConnectedPatch: ActivationSeq, ) }} - MetalKernel.get.download([_wBuffers.w_p!, _bBuffers.w_p!]) - 
MetalKernel.get.download([layerPrev.outs]) - - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() + let biasesPtr = _bBuffers.w.download() let nbSeqPerCol = layerPrev.width / _patch let neuronsPrev = layerPrev.neurons @@ -757,7 +740,7 @@ public class FullyConnectedPatch: ActivationSeq, } }}} - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = layerPrev.outs.download() for batch in 0.. [IWeightArrays] { - var weights = [IWeightArrays]() + var weights = [WeightArrays]() weights.append(_wArrays) if _updateBiases { @@ -1325,8 +1308,7 @@ public class FullyConnectedPatch: ActivationSeq, } var deltaWeights = [T]() - MetalKernel.get.download([_wDeltaWeights]) - var deltaWeightsPtr = _wDeltaWeights.shared.buffer + var deltaWeightsPtr = _wDeltaWeights.download() let offsetStart = elem * nbNeurons * weightWidth for depth in 0..! = nil + var _wDeltaWeights: FloatBuffer! = nil /// /// Buffer of gradients per sample for biases. /// Shape ~ (batch, nbNeurons). /// - var _bDeltaWeights: MetalPrivateBuffer! = nil + var _bDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. public var computeDeltaWeights: Bool = true @@ -98,14 +98,10 @@ public class FullyConnectedSeq: ActivationSeq, return _weightsList } - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - + var weightsTmp = _wBuffers.w.download() if _updateBiases { - MetalKernel.get.download([_bBuffers.w_p!]) - weightsTmp += _bBuffers.w_p!.shared.array + weightsTmp += _bBuffers.w.download() } return weightsTmp } @@ -442,35 +438,24 @@ public class FullyConnectedSeq: ActivationSeq, deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer - + _bBuffers.w.initialize() if _weightsList.count == 0 { - generateWeightsList(buffer: weightsPtr) + generateWeightsList(out: _wBuffers.w, deviceID: deviceID) } else { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: weightHeight * weightWidth - ) + _wBuffers.w.initialize(array: &_weightsList) if _updateBiases { - copyFloatArrayToBuffer( + _bBuffers.w.initialize( array: &_weightsList, - buffer: biasesPtr, - start: weightHeight * weightWidth, - nbElems: weightHeight + start: weightHeight * weightWidth ) } } - _weightsList = [] - - MetalKernel.get.upload([_wBuffers.w_p!, _bBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil _bDeltaWeights = nil } @@ -488,14 +473,14 @@ public class FullyConnectedSeq: ActivationSeq, if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * sequence * nbNeurons * weightWidth, deviceID: deviceID ) if _updateBiases { - _bDeltaWeights = MetalPrivateBuffer( + _bDeltaWeights = FloatBuffer(nbElems: batchSize * sequence * nbNeurons, deviceID: deviceID ) } @@ -656,11 +641,8 @@ public class FullyConnectedSeq: ActivationSeq, ) }} - MetalKernel.get.download([_wBuffers.w_p!, _bBuffers.w_p!]) - MetalKernel.get.download([layerPrev.outs]) - - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() + let biasesPtr = _bBuffers.w.download() let neuronsPrev = layerPrev.neurons! 
let nbNeuronsPrev = layerPrev.nbNeurons @@ -685,7 +667,7 @@ public class FullyConnectedSeq: ActivationSeq, } }}} - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = layerPrev.outs.download() for batch in 0.. [IWeightArrays] { - var weights = [IWeightArrays]() + var weights = [WeightArrays]() weights.append(_wArrays) if _updateBiases { @@ -1210,8 +1192,7 @@ public class FullyConnectedSeq: ActivationSeq, } var deltaWeights = [T]() - MetalKernel.get.download([_wDeltaWeights]) - var deltaWeightsPtr = _wDeltaWeights.shared.buffer + var deltaWeightsPtr = _wDeltaWeights.download() let offsetStart = elem * nbNeurons * weightWidth for depth in 0..! = nil + public internal(set) var loss: FloatBuffer! = nil /// /// Indices of maximal elements. /// Shape ~ (batch, seq). @@ -46,7 +46,7 @@ public class VQSeq: LayerSeq, LayerWeightInit /// Buffer of gradients per sample for biases. /// Shape ~ (batch, K, nbNeurons). /// - var _wDeltaWeights: MetalPrivateBuffer! = nil + var _wDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. public var computeDeltaWeights: Bool = true @@ -87,12 +87,7 @@ public class VQSeq: LayerSeq, LayerWeightInit { return _weightsList } - - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - - return weightsTmp + return _wBuffers.w.download() } set { _weightsList = newValue @@ -304,23 +299,16 @@ public class VQSeq: LayerSeq, LayerWeightInit deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer if _weightsList.count == 0 { - generateWeightsList(buffer: weightsPtr) + generateWeightsList(out: _wBuffers.w, deviceID: deviceID) } else { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: K * nbNeurons - ) + _wBuffers.w.initialize(array: &_weightsList) } - _weightsList = [] - MetalKernel.get.upload([_wBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil } @@ -355,7 +343,7 @@ public class VQSeq: LayerSeq, LayerWeightInit if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * K * nbNeurons, deviceID: deviceID ) } @@ -380,7 +368,9 @@ public class VQSeq: LayerSeq, LayerWeightInit { if loss == nil { - loss = MetalSharedBuffer(batchSize, deviceID: deviceID) + loss = FloatBuffer( + nbElems: batchSize, deviceID: deviceID, shared: true + ) } else if batchSize <= 0 || batchSize > loss.nbElems { @@ -778,9 +768,8 @@ public class VQSeq: LayerSeq, LayerWeightInit command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0..! = nil + private var _camMax: FloatBuffer! = nil /// Number of thread groups in the GPU execution context. var nbThreadgroups: Int @@ -1087,7 +1076,7 @@ public class VQGradSeq: VQSeq if _camMax == nil { - _camMax = MetalPrivateBuffer( + _camMax = FloatBuffer(nbElems: batchSize * nbThreadgroups, deviceID: deviceID ) diff --git a/Sources/GrAIdient/LayerSeq/ValueSeq.swift b/Sources/GrAIdient/LayerSeq/ValueSeq.swift index 09d6b70a..2507e484 100644 --- a/Sources/GrAIdient/LayerSeq/ValueSeq.swift +++ b/Sources/GrAIdient/LayerSeq/ValueSeq.swift @@ -223,11 +223,6 @@ public class ValueSeq: LayerMergeSeq { try checkStateCPU(batchSize: batchSize) - for num in 0..<_layersPrev.count - { - MetalKernel.get.download([(_layersPrev[num] as! 
LayerSeq).outs]) - } - let (nbSameElems, layersIndex, nbElems) = getMergedGraph() var nbGC = nbSameElems @@ -268,10 +263,8 @@ public class ValueSeq: LayerMergeSeq neurons.get(seqQ, depth)!.gc[batch][elem].out = sum }}}}} - let valueBuffer = - (_layersPrev[0] as! LayerSeq).outs.shared.buffer - let scoreBuffer = - (_layersPrev[1] as! LayerSeq).outs.shared.buffer + let valueBuffer = (_layersPrev[0] as! LayerSeq).outs.download() + let scoreBuffer = (_layersPrev[1] as! LayerSeq).outs.download() for batch in 0.. using namespace metal; -kernel void forwardReLU( +kernel void forwardReLUFloat( constant uint * pNbElems, device float * tmps, device float * outs, @@ -39,7 +39,7 @@ kernel void forwardReLU( } } -kernel void backwardReLU( +kernel void backwardReLUFloat( const device float * tmps, constant uint * pNbElems, device float * delta, @@ -65,7 +65,7 @@ kernel void backwardReLU( } } -kernel void forwardLeakyReLU( +kernel void forwardLeakyReLUFloat( constant uint * pNbElems, device float * tmps, device float * outs, @@ -97,7 +97,7 @@ kernel void forwardLeakyReLU( } } -kernel void backwardLeakyReLU( +kernel void backwardLeakyReLUFloat( const device float * tmps, constant uint * pNbElems, device float * delta, @@ -124,7 +124,7 @@ kernel void backwardLeakyReLU( } } -kernel void forwardSoftReLU( +kernel void forwardSoftReLUFloat( constant uint * pNbElems, device float * tmps, device float * outs, @@ -149,7 +149,7 @@ kernel void forwardSoftReLU( outs[id] = Ɛ * tmps[id] + (1 - Ɛ) * log(1 + exp(tmps[id])); } -kernel void backwardSoftReLU( +kernel void backwardSoftReLUFloat( const device float * tmps, constant uint * pNbElems, device float * delta, @@ -174,7 +174,7 @@ kernel void backwardSoftReLU( delta[id] = delta[id] * derivative; } -kernel void forwardSigmoid( +kernel void forwardSigmoidFloat( constant uint * pNbElems, device float * tmps, device float * outs, @@ -205,7 +205,7 @@ kernel void forwardSigmoid( } } -kernel void backwardSigmoid( +kernel void backwardSigmoidFloat( const device float * tmps, constant uint * pNbElems, device float * delta, @@ -239,7 +239,7 @@ kernel void backwardSigmoid( delta[id] = delta[id] * derivative; } -kernel void forwardGELUApprox( +kernel void forwardGELUApproxFloat( constant uint * pNbElems, device float * tmps, device float * outs, @@ -275,7 +275,7 @@ kernel void forwardGELUApprox( outs[id] = 0.5 * x * (1 + tmp2); } -kernel void backwardGELUApprox( +kernel void backwardGELUApproxFloat( const device float * tmps, constant uint * pNbElems, device float * delta, @@ -350,7 +350,7 @@ float erf(float a) return r; } -kernel void forwardGELU( +kernel void forwardGELUFloat( constant uint * pNbElems, device float * tmps, device float * outs, @@ -375,7 +375,7 @@ kernel void forwardGELU( outs[id] = 0.5 * x * (1 + erf(x / sqrt(2.0))); } -kernel void backwardGELU( +kernel void backwardGELUFloat( const device float * tmps, constant uint * pNbElems, device float * delta, diff --git a/Sources/GrAIdient/Metal/Kernel/ActivationHalf.metal b/Sources/GrAIdient/Metal/Kernel/ActivationHalf.metal new file mode 100644 index 00000000..a3e089f5 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/ActivationHalf.metal @@ -0,0 +1,403 @@ +// +// Activation.metal +// GrAIdient +// +// Created by Jean-François Reboud on 14/10/2022. 
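// The Half kernels below mirror the Float ones one for one: buffers are
// declared `device half *` for 16-bit storage, while intermediates (Ɛ,
// derivatives, exp/log results) are largely kept in `float`, relying on
// implicit half<->float conversion on load and store. Every kernel opens with
// the same guard:
//
//     uint nbElems;
//     if (pNbElems) { nbElems = pNbElems[0]; }
//     else          { return; }
//     if (id >= nbElems) { return; }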
+// + +#include +using namespace metal; + +kernel void forwardReLUHalf( + constant uint * pNbElems, + device half * tmps, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + tmps[id] = outs[id]; + if (tmps[id] < 0) + { + outs[id] = 0.0; + } + else + { + outs[id] = tmps[id]; + } +} + +kernel void backwardReLUHalf( + const device half * tmps, + constant uint * pNbElems, + device half * delta, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + if (tmps[id] < 0) + { + delta[id] = 0.0; + } +} + +kernel void forwardLeakyReLUHalf( + constant uint * pNbElems, + device half * tmps, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float Ɛ = 0.01; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + tmps[id] = outs[id]; + if (tmps[id] < 0) + { + outs[id] = Ɛ * tmps[id]; + } + else + { + outs[id] = tmps[id]; + } +} + +kernel void backwardLeakyReLUHalf( + const device half * tmps, + constant uint * pNbElems, + device half * delta, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float Ɛ = 0.01; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + if (tmps[id] < 0) + { + delta[id] = Ɛ * delta[id]; + } +} + +kernel void forwardSoftReLUHalf( + constant uint * pNbElems, + device half * tmps, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float Ɛ = 0.01; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + tmps[id] = outs[id]; + outs[id] = Ɛ * tmps[id] + (1 - Ɛ) * log(1 + exp(tmps[id])); +} + +kernel void backwardSoftReLUHalf( + const device half * tmps, + constant uint * pNbElems, + device half * delta, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float Ɛ = 0.01; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float derivative = Ɛ + (1 - Ɛ) / (1 + exp(-tmps[id])); + delta[id] = delta[id] * derivative; +} + +kernel void forwardSigmoidHalf( + constant uint * pNbElems, + device half * tmps, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + tmps[id] = outs[id]; + if (tmps[id] >= 0) + { + outs[id] = 1.0 / (1.0 + exp(-tmps[id])); + } + else + { + outs[id] = exp(tmps[id]) / (1.0 + exp(tmps[id])); + } +} + +kernel void backwardSigmoidHalf( + const device half * tmps, + constant uint * pNbElems, + device half * delta, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float tmp; + if (tmps[id] >= 0) + { + tmp = 1.0 / (1.0 + exp(-tmps[id])); + } + else + { + tmp = exp(tmps[id]) / (1.0 + exp(tmps[id])); + } + + float derivative = tmp * (1 - tmp); + delta[id] = delta[id] * derivative; +} + +kernel void forwardGELUApproxHalf( + constant uint * pNbElems, + device half * tmps, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= 
nbElems) + { + return ; + } + + float cst = sqrt(2.0 / 3.14159); + float x = outs[id]; + float tmp1 = cst * (x + 0.044715 * pow(x, 3)); + float tmp2; + if (tmp1 >= 0) + { + tmp2 = (1.0 - exp(-2.0 * tmp1)) / (1.0 + exp(-2.0 * tmp1)); + } + else + { + tmp2 = (exp(2.0 * tmp1) - 1.0) / (exp(2.0 * tmp1) + 1.0); + } + tmps[id] = x; + outs[id] = 0.5 * x * (1 + tmp2); +} + +kernel void backwardGELUApproxHalf( + const device half * tmps, + constant uint * pNbElems, + device half * delta, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float cst = sqrt(2.0 / 3.14159); + float x = tmps[id]; + float tmp1 = cst * (x + 0.044715 * pow(x, 3)); + float tmp2; + if (tmp1 >= 0) + { + tmp2 = (1.0 - exp(-2.0 * tmp1)) / (1.0 + exp(-2.0 * tmp1)); + } + else + { + tmp2 = (exp(2.0 * tmp1) - 1.0) / (exp(2.0 * tmp1) + 1.0); + } + float tmp3 = cst * (1 + 3 * 0.044715 * x * x) * (1 - tmp2 * tmp2); + float derivative = 0.5 * (1 + tmp2 + x * tmp3); + delta[id] = delta[id] * derivative; +} + +/* + * Approximation to the error function. + * Based on code from: + * https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff#answer-35148199 + */ +float erf(float a) +{ + float r, s, t, u; + t = metal::abs(a); + s = a * a; + if (t > 0.927734375f) + { + // maximum error 0.99527 ulp + r = metal::fma(-1.72853470e-5f, t, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12 + u = metal::fma(-3.88396438e-3f, t, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6 + r = metal::fma(r, s, u); + r = metal::fma(r, t, -1.06777877e-1f); // -0x1.b55cb8p-4 + r = metal::fma(r, t, -6.34846687e-1f); // -0x1.450aa0p-1 + r = metal::fma(r, t, -1.28717512e-1f); // -0x1.079d0cp-3 + r = metal::fma(r, t, -t); + // TODO, replace with expm1 when implemented + r = 1.0f - metal::exp(r); + r = metal::copysign(r, a); + } + else + { + // maximum error 0.98929 ulp + r = -5.96761703e-4f; // -0x1.38e000p-11 + r = metal::fma(r, s, 4.99119423e-3f); // 0x1.471a58p-8 + r = metal::fma(r, s, -2.67681349e-2f); // -0x1.b691b2p-6 + r = metal::fma(r, s, 1.12819925e-1f); // 0x1.ce1c44p-4 + r = metal::fma(r, s, -3.76125336e-1f); // -0x1.812700p-2 + r = metal::fma(r, s, 1.28379166e-1f); // 0x1.06eba8p-3 + r = metal::fma(r, a, a); + } + return r; +} + +kernel void forwardGELUHalf( + constant uint * pNbElems, + device half * tmps, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float x = outs[id]; + tmps[id] = x; + outs[id] = 0.5 * x * (1 + erf(x / sqrt(2.0))); +} + +kernel void backwardGELUHalf( + const device half * tmps, + constant uint * pNbElems, + device half * delta, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float x = tmps[id]; + float tmp1 = 0.5 * (1.0 + erf(x / sqrt(2.0))); + float tmp2 = x / sqrt(2.0 * M_PI_F) * exp(-x * x / 2.0); + float derivative = tmp1 + tmp2; + delta[id] = delta[id] * derivative; +} diff --git a/Sources/GrAIdient/Metal/Kernel/BatchNorm.metal b/Sources/GrAIdient/Metal/Kernel/BatchNormFloat.metal similarity index 97% rename from Sources/GrAIdient/Metal/Kernel/BatchNorm.metal rename to Sources/GrAIdient/Metal/Kernel/BatchNormFloat.metal index 413ab070..355a3ff8 100644 --- 
a/Sources/GrAIdient/Metal/Kernel/BatchNorm.metal +++ b/Sources/GrAIdient/Metal/Kernel/BatchNormFloat.metal @@ -8,7 +8,7 @@ #include using namespace metal; -kernel void computeBNConvμ( +kernel void computeBNConvμFloat( const device float * tmps, constant uint * pNbChannels, constant uint * pNbBatch, @@ -67,7 +67,7 @@ kernel void computeBNConvμ( } } -kernel void computeBNConvσ2( +kernel void computeBNConvσ2Float( const device float * tmps, const device float * μ, constant uint * pNbChannels, @@ -128,7 +128,7 @@ kernel void computeBNConvσ2( } } -kernel void forwardBNConvTraining( +kernel void forwardBNConvTrainingFloat( const device float * β, const device float * Ɣ, const device float * μ, @@ -178,7 +178,7 @@ kernel void forwardBNConvTraining( tmps[offset] = Ɣ[depth] * xhat + β[depth]; } -kernel void forwardBNConvInference( +kernel void forwardBNConvInferenceFloat( const device float * β, const device float * Ɣ, const device float * Eμ, @@ -234,7 +234,7 @@ kernel void forwardBNConvInference( tmps[offset] = Ɣ[depth] * xhat + β[depth]; } -kernel void backwardWeightsBNConv( +kernel void backwardWeightsBNConvFloat( const device float * delta, const device float * xHat, const device float * Ɣ, @@ -308,7 +308,7 @@ kernel void backwardWeightsBNConv( } } -kernel void backwardBNConvTraining( +kernel void backwardBNConvTrainingFloat( const device float * σ2, const device float * xHat, const device float * Ɣ, @@ -361,7 +361,7 @@ kernel void backwardBNConvTraining( delta[offset] = mult * (tmp1 - tmp2 - tmp3); } -kernel void backwardBNConvInference( +kernel void backwardBNConvInferenceFloat( const device float * Ɣ, const device float * Eσ2, constant uint * pNbChannels, diff --git a/Sources/GrAIdient/Metal/Kernel/BatchNormHalf.metal b/Sources/GrAIdient/Metal/Kernel/BatchNormHalf.metal new file mode 100644 index 00000000..4872c749 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/BatchNormHalf.metal @@ -0,0 +1,415 @@ +// +// BatchNorm.metal +// GrAIdient +// +// Created by Jean-François Reboud on 14/10/2022. 
+// + +#include +using namespace metal; + +kernel void computeBNConvμHalf( + const device half * tmps, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pDimensions, + constant uint * pFirstCall, + device half * μ, + device half * Eμ, + uint id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint width; + uint height; + uint firstCall; + + if (pNbChannels && pNbBatch && pDimensions && pFirstCall && tmps && + μ && Eμ) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + width = pDimensions[0]; + height = pDimensions[1]; + firstCall = *pFirstCall; + } + else + return ; + + uint depth = id; + if (depth >= nbChannels) + { + return ; + } + + uint nbElems = nbBatch * width * height; + float sum = 0.0; + for (uint elem=0; elem= nbChannels) + { + return ; + } + + uint nbElems = nbBatch * width * height; + float sum = 0.0; + for (uint elem=0; elem= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float tmp1 = tmps[offset] - μ[depth]; + float tmp2 = sqrt(σ2[depth] + Ɛ); + float xhat = tmp1 / tmp2; + xHat[offset] = xhat; + tmps[offset] = Ɣ[depth] * xhat + β[depth]; +} + +kernel void forwardBNConvInferenceHalf( + const device half * β, + const device half * Ɣ, + const device half * Eμ, + const device half * Eσ2, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pM, + constant uint * pDimensions, + device half * tmps, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint m; + uint width; + uint height; + float Ɛ = 1e-5; + + if (pNbChannels && pNbBatch && pM && pDimensions && β && Ɣ && + tmps && Eμ && Eσ2) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + m = *pM; + width = pDimensions[0]; + height = pDimensions[1]; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float Var = Eσ2[depth]; + if (m > 1) + { + Var *= (float)m / ((float)m - 1); + } + float tmp1 = tmps[offset] - Eμ[depth]; + float tmp2 = sqrt(Var + Ɛ); + float xhat = tmp1 / tmp2; + tmps[offset] = Ɣ[depth] * xhat + β[depth]; +} + +kernel void backwardWeightsBNConvHalf( + const device half * delta, + const device half * xHat, + const device half * Ɣ, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pDimensions, + constant uint * pAccumulate, + device half * sum1, + device half * sum2, + device half * dƔ, + device half * dβ, + uint id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint width; + uint height; + uint accumulate; + + if (pNbChannels && pNbBatch && pDimensions && pAccumulate && + delta && xHat && Ɣ && + sum1 && sum2 && dƔ && dβ) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + width = pDimensions[0]; + height = pDimensions[1]; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id; + if (depth >= nbChannels) + { + return ; + } + + float tmp1 = 0.0, tmp2 = 0.0; + float tmp3 = 0.0, tmp4 = 0.0; + for (uint elem=0; elem= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float mult = 
1.0 / ((float)nbElems * sqrt(σ2[depth] + Ɛ)); + float dxHat = Ɣ[depth] * delta[offset]; + float tmp1 = nbElems * dxHat; + float tmp2 = sum1[depth]; + float tmp3 = xHat[offset] * sum2[depth]; + + delta[offset] = mult * (tmp1 - tmp2 - tmp3); +} + +kernel void backwardBNConvInferenceHalf( + const device half * Ɣ, + const device half * Eσ2, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pM, + constant uint * pDimensions, + device half * delta, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint m; + uint width; + uint height; + float Ɛ = 1e-5; + + if (pNbChannels && pNbBatch && pM && pDimensions && Ɣ && Eσ2 && delta) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + m = *pM; + width = pDimensions[0]; + height = pDimensions[1]; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float Var = Eσ2[depth]; + if (m > 1) + { + Var *= (float)m / ((float)m - 1); + } + float tmp1 = delta[offset]; + float tmp2 = sqrt(Var + Ɛ); + float xhat = tmp1 / tmp2; + delta[offset] = Ɣ[depth] * xhat; +} diff --git a/Sources/GrAIdient/Metal/Kernel/Biases.metal b/Sources/GrAIdient/Metal/Kernel/BiasesFloat.metal similarity index 96% rename from Sources/GrAIdient/Metal/Kernel/Biases.metal rename to Sources/GrAIdient/Metal/Kernel/BiasesFloat.metal index 31546305..fefd2da2 100644 --- a/Sources/GrAIdient/Metal/Kernel/Biases.metal +++ b/Sources/GrAIdient/Metal/Kernel/BiasesFloat.metal @@ -8,7 +8,7 @@ #include using namespace metal; -kernel void reduceBiases( +kernel void reduceBiasesFloat( const device float * deltaWeights, constant uint * pNbNeurons, constant uint * pNbBatch, diff --git a/Sources/GrAIdient/Metal/Kernel/BiasesHalf.metal b/Sources/GrAIdient/Metal/Kernel/BiasesHalf.metal new file mode 100644 index 00000000..ba24365b --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/BiasesHalf.metal @@ -0,0 +1,53 @@ +// +// Biases.metal +// GrAIdient +// +// Created by Jean-François Reboud on 14/10/2022. 
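+//
+// Half-precision variant of the bias-gradient reduction: per-batch
+// gradients stored as half are summed in a float accumulator, then
+// written (or added, when pAccumulate is set) to the grads buffer.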
+// + +#include +using namespace metal; + +kernel void reduceBiasesHalf( + const device half * deltaWeights, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pAccumulate, + device half * grads, + uint id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint accumulate; + + if (pNbNeurons && pNbBatch && pAccumulate && deltaWeights && grads) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id; + if (depth >= nbNeurons) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem using namespace metal; -kernel void convForward( +kernel void convForwardFloat( const device float * outsPrev, const device float * weights, const device float * biases, @@ -104,7 +104,7 @@ kernel void convForward( outs[offset] = tmp; } -kernel void conv16Forward( +kernel void conv16ForwardFloat( const device float * outsPrev, const device float * weights, const device float * biases, @@ -206,7 +206,7 @@ kernel void conv16Forward( } } -kernel void convBackward( +kernel void convBackwardFloat( const device float * delta, const device float * weights, constant int * pStart, @@ -313,7 +313,7 @@ kernel void convBackward( } } -kernel void conv16Backward( +kernel void conv16BackwardFloat( const device float * delta, const device float * weights, constant int * pStart, @@ -428,7 +428,7 @@ kernel void conv16Backward( } } -kernel void convBatchDerWeights( +kernel void convBatchDerWeightsFloat( const device float * outsPrev, const device float * delta, constant int * pStart, @@ -538,7 +538,7 @@ kernel void convBatchDerWeights( } } -kernel void conv34BatchDerWeights( +kernel void conv34BatchDerWeightsFloat( const device float4 * outsPrev, const device float4 * delta, constant uint * pNbChannels, @@ -783,7 +783,7 @@ kernel void conv34BatchDerWeights( } } -kernel void convBatchDerBiases( +kernel void convBatchDerBiasesFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -838,7 +838,7 @@ kernel void convBatchDerBiases( } } -kernel void convDerWeights( +kernel void convDerWeightsFloat( const device float * outsPrev, const device float * delta, constant int * pStart, @@ -938,7 +938,7 @@ kernel void convDerWeights( deltaWeights[offsetWeights] = tmp; } -kernel void convDerBiases( +kernel void convDerBiasesFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -982,7 +982,7 @@ kernel void convDerBiases( deltaWeights[offsetWeights] = tmp; } -kernel void convReduceWeights( +kernel void convReduceWeightsFloat( const device float * deltaWeights, constant uint * pNbChannels, constant uint * pNbChannelsPrev, diff --git a/Sources/GrAIdient/Metal/Kernel/ConvolutionHalf.metal b/Sources/GrAIdient/Metal/Kernel/ConvolutionHalf.metal new file mode 100644 index 00000000..95d03a60 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/ConvolutionHalf.metal @@ -0,0 +1,1049 @@ +// +// Convolution.metal +// GrAIdient +// +// Created by Jean-François Reboud on 14/10/2022. 
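+//
+// Half-precision variants of the convolution kernels. Indexing and
+// tiling match the Float versions; only the buffer element type
+// changes, and the accumulators are kept in float.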
+// + +#include +using namespace metal; + +kernel void convForwardHalf( + const device half * outsPrev, + const device half * weights, + const device half * biases, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + int offI, offJ; + uint stride; + uint nbBatch; + + if (pStart && pStride && pNbChannels && pNbChannelsPrev && + pDimensions && pDimensionsPrev && pDimWeights && pNbBatch && + outsPrev && weights && biases && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + offI = pStart[4]; + offJ = pStart[5]; + stride = pStride[0]; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth+nbChannels*elem)*height; + + float tmp = biases[depth]; + for (uint depthPrev=0; depthPrev= 0 && + (int)(stride*j)+l-offJ < (int)widthPrev && + (int)(stride*i)+k-offI >= 0 && + (int)(stride*i)+k-offI < (int)heightPrev) + { + uint offsetPrev = (int)(stride*j)+l-offJ + + (offsetStartPrev + (int)(stride*i)+k-offI)*widthPrev; + float outPrev = outsPrev[offsetPrev]; + + uint offsetWeights = l-startJ + + (offsetStartWeights + k-startI) * weightWidth; + float w = weights[offsetWeights]; + + tmp += outPrev * w; + } + }} + } + + uint offset = j + (offsetStart + i)*width; + outs[offset] = tmp; +} + +kernel void conv16ForwardHalf( + const device half * outsPrev, + const device half * weights, + const device half * biases, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + int offI, offJ; + uint stride; + uint nbBatch; + + if (pStart && pStride && pNbChannels && pNbChannelsPrev && + pDimensions && pDimensionsPrev && pDimWeights && pNbBatch && + outsPrev && weights && biases && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + offI = pStart[4]; + offJ = pStart[5]; + stride = pStride[0]; + } + else + return ; + + uint coeff = 16; + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i 
* elem >= height * nbBatch || + j * depth * coeff >= width * nbChannels) + { + return ; + } + + float tmp[16] = {0}; + for (uint depthPrev=0; depthPrev= 0 && + (int)(stride*j)+l-offJ < (int)widthPrev && + (int)(stride*i)+k-offI >= 0 && + (int)(stride*i)+k-offI < (int)heightPrev) + { + uint offsetPrev = (int)(stride*j)+l-offJ + + (offsetStartPrev + (int)(stride*i)+k-offI)*widthPrev; + float outPrev = outsPrev[offsetPrev]; + + for (uint c=0; c= heightPrev * nbBatch || + j * depthPrev >= widthPrev * nbChannelsPrev) + { + return ; + } + + uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * heightPrev; + + float tmp = 0.0; + for (uint depth=0; depth= 0 && j1 < (int)width && + i1 >= 0 && i1 < (int)height) + { + uint offset = j1 + (offsetStart + i1) * width; + float deltaCur = delta[offset]; + + uint offsetWeights = l-startJ + + (offsetStartWeights + k-startI) * weightWidth; + float w = weights[offsetWeights]; + + tmp += deltaCur * w; + } + } + }} + } + + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + if (dirty) + { + deltaPrev[offsetPrev] = tmp; + } + else + { + deltaPrev[offsetPrev] += tmp; + } +} + +kernel void conv16BackwardHalf( + const device half * delta, + const device half * weights, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + int offI, offJ; + uint stride; + uint nbBatch; + uint dirty; + + if (pStart && pStride && pNbChannels && pNbChannelsPrev && + pDimensions && pDimensionsPrev && pDimWeights && pNbBatch && pDirty && + delta && weights && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + offI = pStart[4]; + offJ = pStart[5]; + stride = pStride[0]; + dirty = *pDirty; + } + else + return ; + + uint coeff = 16; + uint depthPrev = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depthPrev * coeff >= widthPrev * nbChannelsPrev) + { + return ; + } + + float tmp[16] = {0}; + for (uint depth=0; depth= 0 && j1 < (int)width && + i1 >= 0 && i1 < (int)height) + { + uint offset = j1 + (offsetStart + i1) * width; + float deltaCur = delta[offset]; + + for (uint c=0; c= nbChannels * weightWidth || + id[1] >= nbChannelsPrev * weightHeight || + weightsI + startI > endI || weightsJ + startJ > endJ) + { + return ; + } + + int i = weightsI + startI; + int j = weightsJ + startJ; + + float tmp = 0.0; + for (uint elem=0; elem= 0 && + (int)(stride*l)+j-offJ < (int)widthPrev && + (int)(stride*k)+i-offI >= 0 && + (int)(stride*k)+i-offI < (int)heightPrev) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + uint offsetPrev = (int)(stride*l)+j-offJ + + (offsetStartPrev + (int)(stride*k)+i-offI)*widthPrev; + float outPrev = outsPrev[offsetPrev]; + + tmp += deltaCur * 
outPrev; + } + }} + } + + uint offsetStartWeights = + (depthPrev + nbChannelsPrev * depth) * weightHeight; + uint offsetWeights = j-startJ + + (offsetStartWeights + i-startI) * weightWidth; + + if (accumulate) + { + grads[offsetWeights] += tmp; + } + else + { + grads[offsetWeights] = tmp; + } +} + +kernel void conv34BatchDerWeightsHalf( + const device half4 * outsPrev, + const device half4 * delta, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pNbBatch, + constant uint * pAccumulate, + device half * grads, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint nbChannelsPrev; + uint nbBatch; + uint accumulate; + + if (pNbChannels && pNbChannelsPrev && pDimensions && + pDimensionsPrev && pNbBatch && pAccumulate && + outsPrev && delta && grads) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id[0]; + uint depthPrev = id[1]; + + if (id[0] >= nbChannels || + id[1] >= nbChannelsPrev) + { + return ; + } + + float tmp[9] = {0.0}; + for (uint elem=0; elem 0 && l > 0) + { + uint offsetPrev0 = + ((l-1)*4 + (offsetStartPrev + k*2-1) * widthPrev) / 4; + float outPrev0 = outsPrev[offsetPrev0][3]; + + tmp[0] += outPrev0 * delta4[0]; + } + if (k > 0) + { + uint offsetPrev1 = + (l*4 + (offsetStartPrev + k*2-1) * widthPrev) / 4; + half4 outPrev1 = outsPrev[offsetPrev1]; + + tmp[0] += outPrev1[0] * delta4[1]; + tmp[0] += outPrev1[1] * delta4[2]; + tmp[0] += outPrev1[2] * delta4[3]; + + half4 sum = outPrev1 * delta4; + tmp[1] += sum[0] + sum[1] + sum[2] + sum[3]; + + tmp[2] += outPrev1[1] * delta4[0]; + tmp[2] += outPrev1[2] * delta4[1]; + tmp[2] += outPrev1[3] * delta4[2]; + } + if (k > 0 && (l+1)*4 < width) + { + uint offsetPrev2 = + ((l+1)*4 + (offsetStartPrev + k*2-1) * widthPrev) / 4; + float outPrev2 = outsPrev[offsetPrev2][0]; + + tmp[2] += outPrev2 * delta4[3]; + } + + if (l > 0) + { + uint offsetPrev3 = + ((l-1)*4 + (offsetStartPrev + k*2) * widthPrev) / 4; + uint offsetPrev6 = + ((l-1)*4 + (offsetStartPrev + k*2+1) * widthPrev) / 4; + float outPrev3 = outsPrev[offsetPrev3][3]; + float outPrev6 = outsPrev[offsetPrev6][3]; + + tmp[0] += outPrev3 * delta7[0]; + tmp[3] += outPrev3 * delta4[0]; + tmp[3] += outPrev6 * delta7[0]; + tmp[6] += outPrev6 * delta4[0]; + } + + uint offsetPrev4 = + (l*4 + (offsetStartPrev + k*2) * widthPrev) / 4; + uint offsetPrev7 = + (l*4 + (offsetStartPrev + k*2+1) * widthPrev) / 4; + half4 outPrev4 = outsPrev[offsetPrev4]; + half4 outPrev7 = outsPrev[offsetPrev7]; + + tmp[0] += outPrev4[0] * delta7[1]; + tmp[0] += outPrev4[1] * delta7[2]; + tmp[0] += outPrev4[2] * delta7[3]; + + half4 sum = outPrev4 * delta7; + tmp[1] += sum[0] + sum[1] + sum[2] + sum[3]; + + tmp[2] += outPrev4[1] * delta7[0]; + tmp[2] += outPrev4[2] * delta7[1]; + tmp[2] += outPrev4[3] * delta7[2]; + + tmp[3] += outPrev4[0] * delta4[1]; + tmp[3] += outPrev4[1] * delta4[2]; + tmp[3] += outPrev4[2] * delta4[3]; + tmp[3] += outPrev7[0] * delta7[1]; + tmp[3] += outPrev7[1] * delta7[2]; + tmp[3] += outPrev7[2] * delta7[3]; + + sum = outPrev4 * delta4; + tmp[4] += sum[0] + sum[1] + sum[2] + sum[3]; + sum = outPrev7 * delta7; + tmp[4] += sum[0] + sum[1] + sum[2] + sum[3]; + + tmp[5] += outPrev4[1] * delta4[0]; + 
tmp[5] += outPrev4[2] * delta4[1]; + tmp[5] += outPrev4[3] * delta4[2]; + tmp[5] += outPrev7[1] * delta7[0]; + tmp[5] += outPrev7[2] * delta7[1]; + tmp[5] += outPrev7[3] * delta7[2]; + + tmp[6] += outPrev7[0] * delta4[1]; + tmp[6] += outPrev7[1] * delta4[2]; + tmp[6] += outPrev7[2] * delta4[3]; + + sum = outPrev7 * delta4; + tmp[7] += sum[0] + sum[1] + sum[2] + sum[3]; + + tmp[8] += outPrev7[1] * delta4[0]; + tmp[8] += outPrev7[2] * delta4[1]; + tmp[8] += outPrev7[3] * delta4[2]; + + if ((l+1)*4 < width) + { + uint offsetPrev5 = + ((l+1)*4 + (offsetStartPrev + k*2) * widthPrev) / 4; + uint offsetPrev8 = + ((l+1)*4 + (offsetStartPrev + k*2+1) * widthPrev) / 4; + float outPrev5 = outsPrev[offsetPrev5][0]; + float outPrev8 = outsPrev[offsetPrev8][0]; + + tmp[2] += outPrev5 * delta7[3]; + tmp[5] += outPrev5 * delta4[3]; + tmp[5] += outPrev8 * delta7[3]; + tmp[8] += outPrev8 * delta4[3]; + } + + if ((k+1)*2 < height && l > 0) + { + uint offsetPrev9 = + ((l-1)*4 + (offsetStartPrev + (k+1)*2) * widthPrev) / 4; + float outPrev9 = outsPrev[offsetPrev9][3]; + + tmp[6] += outPrev9 * delta7[0]; + } + if ((k+1)*2 < height) + { + uint offsetPrev10 = + (l*4 + (offsetStartPrev + (k+1)*2) * widthPrev) / 4; + half4 outPrev10 = outsPrev[offsetPrev10]; + + tmp[6] += outPrev10[0] * delta7[1]; + tmp[6] += outPrev10[1] * delta7[2]; + tmp[6] += outPrev10[2] * delta7[3]; + + half4 sum = outPrev10 * delta7; + tmp[7] += sum[0] + sum[1] + sum[2] + sum[3]; + + tmp[8] += outPrev10[1] * delta7[0]; + tmp[8] += outPrev10[2] * delta7[1]; + tmp[8] += outPrev10[3] * delta7[2]; + } + if ((k+1)*2 < height && (l+1)*4 < width) + { + uint offsetPrev11 = + ((l+1)*4 + (offsetStartPrev + (k+1)*2) * widthPrev) / 4; + float outPrev11 = outsPrev[offsetPrev11][0]; + + tmp[8] += outPrev11 * delta7[3]; + } + }} + } + + uint offsetStartWeights = (depthPrev + nbChannelsPrev * depth) * 3; + uint offsetWeights0 = 0 + (offsetStartWeights + 0) * 3; + uint offsetWeights1 = 1 + (offsetStartWeights + 0) * 3; + uint offsetWeights2 = 2 + (offsetStartWeights + 0) * 3; + uint offsetWeights3 = 0 + (offsetStartWeights + 1) * 3; + uint offsetWeights4 = 1 + (offsetStartWeights + 1) * 3; + uint offsetWeights5 = 2 + (offsetStartWeights + 1) * 3; + uint offsetWeights6 = 0 + (offsetStartWeights + 2) * 3; + uint offsetWeights7 = 1 + (offsetStartWeights + 2) * 3; + uint offsetWeights8 = 2 + (offsetStartWeights + 2) * 3; + + if (accumulate) + { + grads[offsetWeights0] += tmp[0]; + grads[offsetWeights1] += tmp[1]; + grads[offsetWeights2] += tmp[2]; + grads[offsetWeights3] += tmp[3]; + grads[offsetWeights4] += tmp[4]; + grads[offsetWeights5] += tmp[5]; + grads[offsetWeights6] += tmp[6]; + grads[offsetWeights7] += tmp[7]; + grads[offsetWeights8] += tmp[8]; + } + else + { + grads[offsetWeights0] = tmp[0]; + grads[offsetWeights1] = tmp[1]; + grads[offsetWeights2] = tmp[2]; + grads[offsetWeights3] = tmp[3]; + grads[offsetWeights4] = tmp[4]; + grads[offsetWeights5] = tmp[5]; + grads[offsetWeights6] = tmp[6]; + grads[offsetWeights7] = tmp[7]; + grads[offsetWeights8] = tmp[8]; + } +} + +kernel void convBatchDerBiasesHalf( + const device half * delta, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + constant uint * pAccumulate, + device half * grads, + uint id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + uint accumulate; + + if (pNbChannels && pDimensions && pNbBatch && pAccumulate && + delta && grads) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = 
*pNbChannels; + nbBatch = *pNbBatch; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id; + if (depth >= nbChannels) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem= nbBatch * nbChannels * weightWidth || + id[1] >= nbChannelsPrev * weightHeight || + weightsI + startI > endI || weightsJ + startJ > endJ) + { + return ; + } + + uint offsetStartGridWeights = + elem * nbChannels * nbChannelsPrev * weightHeight; + + int i = weightsI + startI; + int j = weightsJ + startJ; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * heightPrev; + uint offsetStartWeights = + (depthPrev + nbChannelsPrev * depth) * weightHeight; + + float tmp = 0.0; + for (uint k=0; k= 0 && + (int)(stride*l)+j-offJ < (int)widthPrev && + (int)(stride*k)+i-offI >= 0 && + (int)(stride*k)+i-offI < (int)heightPrev) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + uint offsetPrev = (int)(stride*l)+j-offJ + + (offsetStartPrev + (int)(stride*k)+i-offI)*widthPrev; + float outPrev = outsPrev[offsetPrev]; + + tmp += deltaCur * outPrev; + } + }} + + uint offsetWeights = j-startJ + + (offsetStartGridWeights+offsetStartWeights+i-startI)*weightWidth; + deltaWeights[offsetWeights] = tmp; +} + +kernel void convDerBiasesHalf( + const device half * delta, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * deltaWeights, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbBatch && delta && deltaWeights) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1]; + + if (depth >= nbChannels || elem >= nbBatch) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + + float tmp = 0.0; + for (uint i=0; i= nbChannels * weightWidth || + id[1] >= nbChannelsPrev * weightHeight) + { + return ; + } + + uint offsetStartWeights = + (depthPrev + nbChannelsPrev * depth) * weightHeight; + uint offsetWeights = weightsJ + + (offsetStartWeights + weightsI) * weightWidth; + + float tmp = 0.0; + for (uint elem=0; elem using namespace metal; -kernel void deconvForward( +kernel void deconvForwardFloat( const device float * outsPrev, const device float * weights, const device float * biases, @@ -105,7 +105,7 @@ kernel void deconvForward( outs[offset] = tmp; } -kernel void deconvBackward( +kernel void deconvBackwardFloat( const device float * delta, const device float * weights, constant int * pStart, @@ -206,7 +206,7 @@ kernel void deconvBackward( } } -kernel void deconvBatchDerWeights( +kernel void deconvBatchDerWeightsFloat( const device float * outsPrev, const device float * delta, constant int * pStart, @@ -317,7 +317,7 @@ kernel void deconvBatchDerWeights( } } -kernel void deconvDerWeights( +kernel void deconvDerWeightsFloat( const device float * outsPrev, const device float * delta, constant int * pStart, diff --git a/Sources/GrAIdient/Metal/Kernel/DeconvolutionHalf.metal b/Sources/GrAIdient/Metal/Kernel/DeconvolutionHalf.metal new file mode 100644 index 00000000..2708d252 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/DeconvolutionHalf.metal @@ -0,0 +1,419 @@ +// +// Deconvolution.metal +// GrAIdient +// +// Created by Jean-François Reboud on 28/12/2022. 
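+//
+// Half-precision variants of the deconvolution kernels, mirroring the
+// Float versions with half buffers and float accumulators.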
+// + +#include +using namespace metal; + +kernel void deconvForwardHalf( + const device half * outsPrev, + const device half * weights, + const device half * biases, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + uint stride; + uint nbBatch; + + if (pStart && pStride && pNbChannels && pNbChannelsPrev && + pDimensions && pDimensionsPrev && pDimWeights && pNbBatch && + outsPrev && weights && biases && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + stride = pStride[0]; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth+nbChannels*elem)*height; + + float tmp = biases[depth]; + for (uint depthPrev=0; depthPrev= 0 && j1 < (int)widthPrev && + i1 >= 0 && i1 < (int)heightPrev) + { + uint offsetPrev = j1 + + (offsetStartPrev + i1) * widthPrev; + float outPrev = outsPrev[offsetPrev]; + + uint offsetWeights = l-startJ + + (offsetStartWeights + k-startI) * weightWidth; + float w = weights[offsetWeights]; + + tmp += outPrev * w; + } + } + }} + } + + uint offset = j + (offsetStart + i)*width; + outs[offset] = tmp; +} + +kernel void deconvBackwardHalf( + const device half * delta, + const device half * weights, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + uint stride; + uint nbBatch; + uint dirty; + + if (pStart && pStride && pNbChannels && pNbChannelsPrev && + pDimensions && pDimensionsPrev && pDimWeights && pNbBatch && pDirty && + delta && weights && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + stride = pStride[0]; + dirty = *pDirty; + } + else + return ; + + uint depthPrev = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depthPrev >= widthPrev * nbChannelsPrev) + { + return ; + } + + uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * heightPrev; 
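+    // Accumulate, in float, the weighted deltas from every output unit
+    // that this previous-layer position fed during the forward pass.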
+ + float tmp = 0.0; + for (uint depth=0; depth= 0 && + (int)(stride*j)+l-startJ < (int)width && + (int)(stride*i)+k-startI >= 0 && + (int)(stride*i)+k-startI < (int)height) + { + uint offset = (int)(stride*j)+l-startJ + + (offsetStart + (int)(stride*i)+k-startI) * width; + float deltaCur = delta[offset]; + + uint offsetWeights = l-startJ + + (offsetStartWeights + k-startI) * weightWidth; + float w = weights[offsetWeights]; + + tmp += deltaCur * w; + } + }} + } + + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + if (dirty) + { + deltaPrev[offsetPrev] = tmp; + } + else + { + deltaPrev[offsetPrev] += tmp; + } +} + +kernel void deconvBatchDerWeightsHalf( + const device half * outsPrev, + const device half * delta, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + constant uint * pAccumulate, + device half * grads, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + uint stride; + uint nbBatch; + uint accumulate; + + if (pStart && pStride && pNbChannels && pNbChannelsPrev && pDimensions && + pDimensionsPrev && pDimWeights && pNbBatch && pAccumulate && + outsPrev && delta && grads) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + stride = pStride[0]; + accumulate = *pAccumulate; + } + else + return ; + + int weightsI = id[1] / nbChannelsPrev; + int weightsJ = id[0] / nbChannels; + uint depth = id[0] % nbChannels; + uint depthPrev = id[1] % nbChannelsPrev; + + if (id[0] >= nbChannels * weightWidth || + id[1] >= nbChannelsPrev * weightHeight || + weightsI + startI > endI || weightsJ + startJ > endJ) + { + return ; + } + + int i = weightsI + startI; + int j = weightsJ + startJ; + + float tmp = 0.0; + for (uint elem=0; elem= 0 && j1 < (int)widthPrev && + i1 >= 0 && i1 < (int)heightPrev) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + uint offsetPrev = j1 + + (offsetStartPrev + i1)*widthPrev; + float outPrev = outsPrev[offsetPrev]; + + tmp += deltaCur * outPrev; + } + } + }} + } + + uint offsetStartWeights = + (depthPrev + nbChannelsPrev * depth) * weightHeight; + uint offsetWeights = j-startJ + + (offsetStartWeights + i-startI) * weightWidth; + + if (accumulate) + { + grads[offsetWeights] += tmp; + } + else + { + grads[offsetWeights] = tmp; + } +} + +kernel void deconvDerWeightsHalf( + const device half * outsPrev, + const device half * delta, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + device half * deltaWeights, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + uint stride; + uint nbBatch; + + if (pStart && pStride && 
pNbChannels && pNbChannelsPrev && pDimensions && + pDimensionsPrev && pDimWeights && pNbBatch && + outsPrev && delta && deltaWeights) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + stride = pStride[0]; + } + else + return ; + + uint remains = id[0]; + uint elem = remains / (weightWidth * nbChannels); + remains = remains % (weightWidth * nbChannels); + int weightsI = id[1] / nbChannelsPrev; + int weightsJ = remains / nbChannels; + uint depth = remains % nbChannels; + uint depthPrev = id[1] % nbChannelsPrev; + + if (id[0] >= nbBatch * nbChannels * weightWidth || + id[1] >= nbChannelsPrev * weightHeight || + weightsI + startI > endI || weightsJ + startJ > endJ) + { + return ; + } + + uint offsetStartGridWeights = + elem * nbChannels * nbChannelsPrev * weightHeight; + + int i = weightsI + startI; + int j = weightsJ + startJ; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * heightPrev; + uint offsetStartWeights = + (depthPrev + nbChannelsPrev * depth) * weightHeight; + + float tmp = 0.0; + for (uint k=0; k= 0 && j1 < (int)widthPrev && + i1 >= 0 && i1 < (int)heightPrev) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + uint offsetPrev = j1 + + (offsetStartPrev + i1)*widthPrev; + float outPrev = outsPrev[offsetPrev]; + + tmp += deltaCur * outPrev; + } + } + }} + + uint offsetWeights = j-startJ + + (offsetStartGridWeights+offsetStartWeights+i-startI)*weightWidth; + deltaWeights[offsetWeights] = tmp; +} diff --git a/Sources/GrAIdient/Metal/Kernel/FullyConnected.metal b/Sources/GrAIdient/Metal/Kernel/FullyConnectedFloat.metal similarity index 97% rename from Sources/GrAIdient/Metal/Kernel/FullyConnected.metal rename to Sources/GrAIdient/Metal/Kernel/FullyConnectedFloat.metal index 7f12744a..e7abeb06 100644 --- a/Sources/GrAIdient/Metal/Kernel/FullyConnected.metal +++ b/Sources/GrAIdient/Metal/Kernel/FullyConnectedFloat.metal @@ -8,7 +8,7 @@ #include using namespace metal; -kernel void flForward( +kernel void flForwardFloat( const device float * outsPrev, const device float * weights, const device float * biases, @@ -56,7 +56,7 @@ kernel void flForward( outs[offset] = tmp; } -kernel void flBackward( +kernel void flBackwardFloat( const device float * delta, const device float * weights, constant uint * pNbNeurons, @@ -113,7 +113,7 @@ kernel void flBackward( } } -kernel void flBatchDerWeights( +kernel void flBatchDerWeightsFloat( const device float * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -170,7 +170,7 @@ kernel void flBatchDerWeights( } } -kernel void flBatchDerBiases( +kernel void flBatchDerBiasesFloat( const device float * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -214,7 +214,7 @@ kernel void flBatchDerBiases( } } -kernel void flDerWeights( +kernel void flDerWeightsFloat( const device float * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -261,7 +261,7 @@ kernel void flDerWeights( deltaWeights[offsetWeights] = tmp; } -kernel void flDerBiases( +kernel void flDerBiasesFloat( const device float * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -294,7 +294,7 @@ kernel void flDerBiases( 
deltaWeights[offsetWeights] = deltaCur; } -kernel void flReduceWeights( +kernel void flReduceWeightsFloat( const device float * deltaWeights, constant uint * pNbNeurons, constant uint * pNbNeuronsPrev, diff --git a/Sources/GrAIdient/Metal/Kernel/FullyConnectedHalf.metal b/Sources/GrAIdient/Metal/Kernel/FullyConnectedHalf.metal new file mode 100644 index 00000000..63c717f9 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/FullyConnectedHalf.metal @@ -0,0 +1,347 @@ +// +// FullyConnected.metal +// GrAIdient +// +// Created by Jean-François Reboud on 14/10/2022. +// + +#include +using namespace metal; + +kernel void flForwardHalf( + const device half * outsPrev, + const device half * weights, + const device half * biases, + constant uint * pNbNeurons, + constant uint * pNbNeuronsPrev, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbNeuronsPrev; + uint nbBatch; + + if (pNbNeurons && pNbNeuronsPrev && pNbBatch && + outsPrev && weights && biases && outs) + { + nbNeurons = *pNbNeurons; + nbNeuronsPrev = *pNbNeuronsPrev; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1]; + + if (depth >= nbNeurons || elem >= nbBatch) + { + return ; + } + + float tmp = biases[depth]; + for (uint depthPrev=0; depthPrev= nbNeuronsPrev || elem >= nbBatch) + { + return ; + } + + float tmp = 0.0; + for (uint depth=0; depth= nbNeurons || depthPrev >= nbNeuronsPrev) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem= nbNeurons) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem= nbNeurons * nbBatch || + depthPrev >= nbNeuronsPrev) + { + return ; + } + + uint offset = depth + nbNeurons * elem; + float deltaCur = delta[offset]; + + uint offsetPrev = depthPrev + nbNeuronsPrev * elem; + float outPrev = outsPrev[offsetPrev]; + + float tmp = deltaCur * outPrev; + + uint offsetStartWeights = elem * nbNeurons * nbNeuronsPrev; + uint offsetWeights = offsetStartWeights + + depthPrev + nbNeuronsPrev * depth; + deltaWeights[offsetWeights] = tmp; +} + +kernel void flDerBiasesHalf( + const device half * delta, + constant uint * pNbNeurons, + constant uint * pNbBatch, + device half * deltaWeights, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + + if (pNbNeurons && pNbBatch && delta && deltaWeights) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1]; + + if (depth >= nbNeurons || elem >= nbBatch) + { + return ; + } + + uint offset = depth + nbNeurons * elem; + float deltaCur = delta[offset]; + + uint offsetWeights = elem * nbNeurons + depth; + deltaWeights[offsetWeights] = deltaCur; +} + +kernel void flReduceWeightsHalf( + const device half * deltaWeights, + constant uint * pNbNeurons, + constant uint * pNbNeuronsPrev, + constant uint * pNbBatch, + constant uint * pAccumulate, + device half * grads, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbNeuronsPrev; + uint nbBatch; + uint accumulate; + + if (pNbNeurons && pNbNeuronsPrev && pNbBatch && pAccumulate && + deltaWeights && grads) + { + nbNeurons = *pNbNeurons; + nbNeuronsPrev = *pNbNeuronsPrev; + nbBatch = *pNbBatch; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id[0]; + uint depthPrev = id[1]; + + if (depth >= nbNeurons || depthPrev >= nbNeuronsPrev) + { + return ; + } + + uint offsetWeights = depthPrev + nbNeuronsPrev * depth; + + float tmp = 0.0; + for (uint elem=0; elem using namespace metal; 
-kernel void flPatchForward( +kernel void flPatchForwardFloat( const device float * outsPrev, const device float * weights, const device float * biases, @@ -85,7 +85,7 @@ kernel void flPatchForward( outs[offset] = tmp; } -kernel void flPatchBackward( +kernel void flPatchBackwardFloat( const device float * delta, const device float * weights, constant uint * pNbNeurons, @@ -170,7 +170,7 @@ kernel void flPatchBackward( } } -kernel void flPatchBatchDerWeights( +kernel void flPatchBatchDerWeightsFloat( const device float * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -256,7 +256,7 @@ kernel void flPatchBatchDerWeights( } } -kernel void flPatchBatchDerBiases( +kernel void flPatchBatchDerBiasesFloat( const device float * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -304,7 +304,7 @@ kernel void flPatchBatchDerBiases( } } -kernel void flPatchBatch4DerBiases( +kernel void flPatchBatch4DerBiasesFloat( const device float4 * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -353,7 +353,7 @@ kernel void flPatchBatch4DerBiases( } } -kernel void flPatchDerWeights( +kernel void flPatchDerWeightsFloat( const device float * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -432,7 +432,7 @@ kernel void flPatchDerWeights( deltaWeights[offsetWeights] = tmp; } -kernel void flPatchDerBiases( +kernel void flPatchDerBiasesFloat( const device float * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -472,7 +472,7 @@ kernel void flPatchDerBiases( deltaWeights[offsetWeights] = tmp; } -kernel void flPatchReduceWeights( +kernel void flPatchReduceWeightsFloat( const device float * deltaWeights, constant uint * pNbNeurons, constant uint * pNbChannelsPrev, diff --git a/Sources/GrAIdient/Metal/Kernel/FullyConnectedPatchHalf.metal b/Sources/GrAIdient/Metal/Kernel/FullyConnectedPatchHalf.metal new file mode 100644 index 00000000..4a6c3e36 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/FullyConnectedPatchHalf.metal @@ -0,0 +1,529 @@ +// +// FullyConnectedPatch.metal +// GrAIdient +// +// Created by Jean-François Reboud on 25/02/2023. 
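+//
+// Half-precision variants of the fully-connected patch kernels, which
+// flatten patch x patch blocks of the previous 2D layer into sequence
+// positions; dot products are accumulated in float.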
+// + +#include +using namespace metal; + +kernel void flPatchForwardHalf( + const device half * outsPrev, + const device half * weights, + const device half * biases, + constant uint * pNbNeurons, + constant uint * pNbChannelsPrev, + constant uint * pDimensionsPrev, + constant uint * pPatch, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbChannelsPrev; + uint heightPrev, widthPrev; + uint patch; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pNbChannelsPrev && pDimensionsPrev && pPatch && + pNbBatch && pSequence && + outsPrev && weights && biases && outs) + { + nbNeurons = *pNbNeurons; + nbChannelsPrev = *pNbChannelsPrev; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + patch = *pPatch; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint weightWidth = nbChannelsPrev * patch * patch; + uint depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth >= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint nbSeqPerCol = widthPrev / patch; + uint seqI = seq / nbSeqPerCol; + uint seqJ = seq % nbSeqPerCol; + + uint iStart = seqI * patch; + uint jStart = seqJ * patch; + + float tmp = biases[depth]; + for (uint depthPrev=0; depthPrev= weightWidth || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint nbSeqPerCol = widthPrev / patch; + uint seqI = seq / nbSeqPerCol; + uint seqJ = seq % nbSeqPerCol; + + uint iStart = seqI * patch; + uint jStart = seqJ * patch; + + uint res = offsetWeight; + uint depthPrev = res / (patch * patch); + res -= depthPrev * patch * patch; + uint i = res / patch; + res -= i * patch; + uint j = res; + + float tmp = 0.0; + for (uint depth=0; depth= nbNeurons || offsetWeight >= weightWidth) + { + return ; + } + + uint nbSeqPerCol = widthPrev / patch; + + uint res = offsetWeight; + uint depthPrev = res / (patch * patch); + res -= depthPrev * patch * patch; + uint i = res / patch; + res -= i * patch; + uint j = res; + + float tmp = 0.0; + for (uint elem=0; elem= nbNeurons) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem= nbNeurons) + { + return ; + } + + half4 tmp = 0.0; + for (uint elem=0; elem= nbNeurons * nbBatch || + offsetWeight >= weightWidth) + { + return ; + } + + uint nbSeqPerCol = widthPrev / patch; + + uint res = offsetWeight; + uint depthPrev = res / (patch * patch); + res -= depthPrev * patch * patch; + uint i = res / patch; + res -= i * patch; + uint j = res; + + float tmp = 0.0; + for (uint seq=0; seq= nbNeurons || elem >= nbBatch) + { + return ; + } + + float tmp = 0.0; + for (uint seq=0; seq= nbNeurons || offsetWeight >= weightWidth) + { + return ; + } + + uint offsetWeights = offsetWeight + weightWidth * depth; + + float tmp = 0.0; + for (uint elem=0; elem using namespace metal; -kernel void flSeqForward( +kernel void flSeqForwardFloat( const device float * outsPrev, const device float * weights, const device float * biases, @@ -61,7 +61,7 @@ kernel void flSeqForward( outs[offset] = tmp; } -kernel void flSeq48Forward( +kernel void flSeq48ForwardFloat( const device float4 * outsPrev, const device float4 * weights, const device float * biases, @@ -123,7 +123,7 @@ kernel void flSeq48Forward( } } -kernel void flSeq4Forward( +kernel void flSeq4ForwardFloat( const device float4 * outsPrev, const device float4 * weights, const device float * biases, @@ -176,7 +176,7 @@ kernel void flSeq4Forward( outs[offset] = tmp[0] + tmp[1] 
+ tmp[2] + tmp[3] + biases[depth]; } -kernel void flSeqBackward( +kernel void flSeqBackwardFloat( const device float * delta, const device float * weights, constant uint * pNbNeurons, @@ -239,7 +239,7 @@ kernel void flSeqBackward( } } -kernel void flSeq48Backward( +kernel void flSeq48BackwardFloat( const device float * delta, const device float4 * weights, constant uint * pNbNeurons, @@ -315,7 +315,7 @@ kernel void flSeq48Backward( } } -kernel void flSeq4Backward( +kernel void flSeq4BackwardFloat( const device float * delta, const device float4 * weights, constant uint * pNbNeurons, @@ -378,7 +378,7 @@ kernel void flSeq4Backward( } } -kernel void flSeqBatchDerWeights( +kernel void flSeqBatchDerWeightsFloat( const device float * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -440,7 +440,7 @@ kernel void flSeqBatchDerWeights( } } -kernel void flSeqBatch4DerWeights( +kernel void flSeqBatch4DerWeightsFloat( const device float4 * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -502,7 +502,7 @@ kernel void flSeqBatch4DerWeights( } } -kernel void flSeqDerWeights( +kernel void flSeqDerWeightsFloat( const device float * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -556,7 +556,7 @@ kernel void flSeqDerWeights( deltaWeights[offsetWeights] = tmp; } -kernel void flSeqReduceWeights( +kernel void flSeqReduceWeightsFloat( const device float * deltaWeights, constant uint * pNbNeurons, constant uint * pNbNeuronsPrev, diff --git a/Sources/GrAIdient/Metal/Kernel/FullyConnectedSeqHalf.metal b/Sources/GrAIdient/Metal/Kernel/FullyConnectedSeqHalf.metal new file mode 100644 index 00000000..658d30de --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/FullyConnectedSeqHalf.metal @@ -0,0 +1,609 @@ +// +// FullyConnectedSeq.metal +// GrAIdient +// +// Created by Jean-François Reboud on 08/03/2023. 
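+//
+// Half-precision variants of the sequence fully-connected kernels,
+// including the half4 vectorized paths that mirror the float4 versions.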
+// + +#include +using namespace metal; + +kernel void flSeqForwardHalf( + const device half * outsPrev, + const device half * weights, + const device half * biases, + constant uint * pNbNeurons, + constant uint * pNbNeuronsPrev, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbNeuronsPrev; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pNbNeuronsPrev && pNbBatch && pSequence && + outsPrev && weights && biases && outs) + { + nbNeurons = *pNbNeurons; + nbNeuronsPrev = *pNbNeuronsPrev; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth >= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + float tmp = biases[depth]; + for (uint depthPrev=0; depthPrev= nbNeurons || elem * coeff >= nbBatch || seq >= sequence) + { + return ; + } + + half4 tmp[8] = {0}; + for (uint depthPrev=0; depthPrev= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + half4 tmp = 0; + for (uint depthPrev=0; depthPrev= nbNeuronsPrev || elem >= nbBatch || seq >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint depth=0; depth= nbNeuronsPrev || + elem * coeff >= nbBatch || seq >= sequence) + { + return ; + } + + half4 tmp[8] = {0}; + for (uint depth=0; depth= nbNeuronsPrev || elem >= nbBatch || seq >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint depth=0; depth= nbNeurons || depthPrev >= nbNeuronsPrev) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem= nbNeurons || depthPrev * 4 >= nbNeuronsPrev) + { + return ; + } + + half4 tmp = 0.0; + for (uint elem=0; elem= nbNeurons * nbBatch || + depthPrev >= nbNeuronsPrev) + { + return ; + } + + float tmp = 0.0; + for (uint seq=0; seq= nbNeurons || depthPrev >= nbNeuronsPrev) + { + return ; + } + + uint offsetWeights = depthPrev + nbNeuronsPrev * depth; + + float tmp = 0.0; + for (uint elem=0; elem using namespace metal; -kernel void computeInstanceNormConvμ( +kernel void computeInstanceNormConvμFloat( const device float * tmps, constant uint * pNbChannels, constant uint * pNbBatch, @@ -53,7 +53,7 @@ kernel void computeInstanceNormConvμ( μ[depth + nbChannels * elem] = sum / nbElems; } -kernel void computeInstanceNormConvσ2( +kernel void computeInstanceNormConvσ2Float( const device float * tmps, const device float * μ, constant uint * pNbChannels, @@ -100,7 +100,7 @@ kernel void computeInstanceNormConvσ2( σ2[depth + nbChannels * elem] = sum / nbElems; } -kernel void forwardInstanceNormConv( +kernel void forwardInstanceNormConvFloat( const device float * β, const device float * Ɣ, const device float * μ, @@ -150,7 +150,7 @@ kernel void forwardInstanceNormConv( tmps[offset] = Ɣ[depth] * xhat + β[depth]; } -kernel void forwardAdaIN( +kernel void forwardAdaINFloat( const device float * outsPrev, const device float * styles, const device float * μ, @@ -200,7 +200,7 @@ kernel void forwardAdaIN( outs[offset] = styles[depth] * xhat + styles[depth + nbChannels]; } -kernel void backwardWeightsInstanceNormConv( +kernel void backwardWeightsInstanceNormConvFloat( const device float * delta, const device float * xHat, const device float * Ɣ, @@ -274,7 +274,7 @@ kernel void backwardWeightsInstanceNormConv( } } -kernel void backward2AdaIN( +kernel void backward2AdaINFloat( const device float * delta, const device float * xHat, const device float * outStyles, @@ -347,7 +347,7 @@ kernel void 
backward2AdaIN( } } -kernel void backwardInstanceNormConv( +kernel void backwardInstanceNormConvFloat( const device float * σ2, const device float * xHat, const device float * Ɣ, @@ -401,7 +401,7 @@ kernel void backwardInstanceNormConv( delta[offset] = mult * (tmp1 - tmp2 - tmp3); } -kernel void backward1AdaIN( +kernel void backward1AdaINFloat( const device float * delta, const device float * σ2, const device float * xHat, diff --git a/Sources/GrAIdient/Metal/Kernel/InstanceNormHalf.metal b/Sources/GrAIdient/Metal/Kernel/InstanceNormHalf.metal new file mode 100644 index 00000000..6a797f7d --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/InstanceNormHalf.metal @@ -0,0 +1,467 @@ +// +// InstanceNorm.metal +// GrAIdient +// +// Created by Jean-François Reboud on 17/02/2022. +// + +#include +using namespace metal; + +kernel void computeInstanceNormConvμHalf( + const device half * tmps, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pDimensions, + device half * μ, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint width; + uint height; + + if (pNbChannels && pNbBatch && pDimensions && tmps && μ) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + width = pDimensions[0]; + height = pDimensions[1]; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1]; + if (depth >= nbChannels || elem >= nbBatch) + { + return ; + } + + uint nbElems = width * height; + float sum = 0.0; + + for (uint x=0; x= nbChannels || elem >= nbBatch) + { + return ; + } + + uint nbElems = width * height; + float sum = 0.0; + + for (uint x=0; x= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float tmp1 = tmps[offset] - μ[depth + nbChannels * elem]; + float tmp2 = sqrt(σ2[depth + nbChannels * elem] + Ɛ); + float xhat = tmp1 / tmp2; + xHat[offset] = xhat; + tmps[offset] = Ɣ[depth] * xhat + β[depth]; +} + +kernel void forwardAdaINHalf( + const device half * outsPrev, + const device half * styles, + const device half * μ, + const device half * σ2, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pDimensions, + device half * outs, + device half * xHat, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint width; + uint height; + float Ɛ = 1e-5; + + if (pNbChannels && pNbBatch && pDimensions && outsPrev && styles && + outs && xHat && μ && σ2) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + width = pDimensions[0]; + height = pDimensions[1]; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float tmp1 = outsPrev[offset] - μ[depth + nbChannels * elem]; + float tmp2 = sqrt(σ2[depth + nbChannels * elem] + Ɛ); + float xhat = tmp1 / tmp2; + xHat[offset] = xhat; + outs[offset] = styles[depth] * xhat + styles[depth + nbChannels]; +} + +kernel void backwardWeightsInstanceNormConvHalf( + const device half * delta, + const device half * xHat, + const device half * Ɣ, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pDimensions, + constant uint * pAccumulate, + device half * sum1, + device half * sum2, + device half * dƔ, + device half * dβ, 
+ uint id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint width; + uint height; + uint accumulate; + + if (pNbChannels && pNbBatch && pDimensions && pAccumulate && + delta && xHat && Ɣ && + sum1 && sum2 && dƔ && dβ) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + width = pDimensions[0]; + height = pDimensions[1]; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id; + if (depth >= nbChannels) + { + return ; + } + + float tmp3 = 0.0, tmp4 = 0.0; + for (uint elem=0; elem= nbChannels || elem >= nbBatch) + { + return ; + } + + float tmp1 = 0.0, tmp2 = 0.0; + float tmp3 = 0.0, tmp4 = 0.0; + + for (uint x=0; x= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float mult = + 1.0 / ((float)nbElems * sqrt(σ2[depth + nbChannels * elem] + Ɛ)); + float dxHat = Ɣ[depth] * delta[offset]; + float tmp1 = nbElems * dxHat; + float tmp2 = sum1[depth + nbChannels * elem]; + float tmp3 = xHat[offset] * sum2[depth + nbChannels * elem]; + + delta[offset] = mult * (tmp1 - tmp2 - tmp3); +} + +kernel void backward1AdaINHalf( + const device half * delta, + const device half * σ2, + const device half * xHat, + const device half * styles, + const device half * sum1, + const device half * sum2, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pDimensions, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint width; + uint height; + uint dirty; + float Ɛ = 1e-5; + + if (pNbChannels && pNbBatch && pDimensions && pDirty && + delta && σ2 && xHat && styles && sum1 && sum2 && deltaPrev) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + width = pDimensions[0]; + height = pDimensions[1]; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + uint nbElems = width * height; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float mult = + 1.0 / ((float)nbElems * sqrt(σ2[depth + nbChannels * elem] + Ɛ)); + float dxHat = styles[depth] * delta[offset]; + float tmp1 = nbElems * dxHat; + float tmp2 = sum1[depth + nbChannels * elem]; + float tmp3 = xHat[offset] * sum2[depth + nbChannels * elem]; + + if (dirty) + { + deltaPrev[offset] = mult * (tmp1 - tmp2 - tmp3); + } + else + { + deltaPrev[offset] += mult * (tmp1 - tmp2 - tmp3); + } +} diff --git a/Sources/GrAIdient/Metal/Kernel/Layer1D.metal b/Sources/GrAIdient/Metal/Kernel/Layer1DFloat.metal similarity index 96% rename from Sources/GrAIdient/Metal/Kernel/Layer1D.metal rename to Sources/GrAIdient/Metal/Kernel/Layer1DFloat.metal index e5137942..bac32006 100644 --- a/Sources/GrAIdient/Metal/Kernel/Layer1D.metal +++ b/Sources/GrAIdient/Metal/Kernel/Layer1DFloat.metal @@ -8,7 +8,7 @@ #include using namespace metal; -kernel void MSE1DLoss( +kernel void MSE1DLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -48,7 +48,7 @@ kernel void MSE1DLoss( losses[elem] = tmp; } -kernel void MSE1DLossDerivative( +kernel void MSE1DLossDerivativeFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -98,7 +98,7 @@ kernel void MSE1DLossDerivative( 
} } -kernel void linearErrorLoss( +kernel void linearErrorLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -138,7 +138,7 @@ kernel void linearErrorLoss( losses[elem] = tmp; } -kernel void linearErrorLossDerivative( +kernel void linearErrorLossDerivativeFloat( const device float * outs, constant uint * pNbNeurons, constant float * pCoeff, @@ -182,7 +182,7 @@ kernel void linearErrorLossDerivative( } } -kernel void selectNeurons1DForward( +kernel void selectNeurons1DForwardFloat( const device float * outsPrev, constant uint * pNbNeurons, constant uint * pNbNeuronsPrev, @@ -219,7 +219,7 @@ kernel void selectNeurons1DForward( outs[offset] = pCoeffs[depth] * outsPrev[offsetPrev]; } -kernel void selectNeurons1DBackward( +kernel void selectNeurons1DBackwardFloat( const device float * delta, constant uint * pNbNeurons, constant uint * pNbNeuronsPrev, @@ -256,7 +256,7 @@ kernel void selectNeurons1DBackward( deltaPrev[offsetPrev] += pCoeffs[depth] * delta[offset]; } -kernel void concat1DForward( +kernel void concat1DForwardFloat( const device float * outsPrev, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -295,7 +295,7 @@ kernel void concat1DForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void concat1DBackward( +kernel void concat1DBackwardFloat( const device float * delta, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -344,7 +344,7 @@ kernel void concat1DBackward( } } -kernel void softmax1DForward( +kernel void softmax1DForwardFloat( const device float * outsPrev, constant uint * pNbHeads, constant uint * pNbNeurons, @@ -401,7 +401,7 @@ kernel void softmax1DForward( outs[offset] = exp(outPrev - cMax) / sum1; } -kernel void softmax1DBackward( +kernel void softmax1DBackwardFloat( const device float * outs, const device float * delta, constant uint * pNbHeads, @@ -461,7 +461,7 @@ kernel void softmax1DBackward( } } -kernel void dotProduct1DForward( +kernel void dotProduct1DForwardFloat( const device float * outsPrev1, const device float * outsPrev2, constant int * pSize, @@ -508,7 +508,7 @@ kernel void dotProduct1DForward( outs[offset] = sum; } -kernel void dotProduct1DBackward( +kernel void dotProduct1DBackwardFloat( const device float * outsPrev, const device float * delta, constant int * pSize, @@ -563,7 +563,7 @@ kernel void dotProduct1DBackward( } } -kernel void constant1DForward( +kernel void constant1DForwardFloat( const device float * weights, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -593,7 +593,7 @@ kernel void constant1DForward( outs[offset] = weights[depth]; } -kernel void BCE1DLoss( +kernel void BCE1DLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -634,7 +634,7 @@ kernel void BCE1DLoss( losses[elem] = tmp; } -kernel void BCE1DLossDerivative( +kernel void BCE1DLossDerivativeFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -693,7 +693,7 @@ kernel void BCE1DLossDerivative( } } -kernel void BCESigmoid1DLoss( +kernel void BCESigmoid1DLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -744,7 +744,7 @@ kernel void BCESigmoid1DLoss( losses[elem] = tmp; } -kernel void BCESigmoid1DLossDerivative( +kernel void BCESigmoid1DLossDerivativeFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -803,7 +803,7 @@ kernel void BCESigmoid1DLossDerivative( } } -kernel void dropout1DForward( +kernel void 
dropout1DForwardFloat(
     const device float * outsPrev,
     const device bool * dropout,
     constant uint * pNbNeurons,
@@ -852,7 +852,7 @@ kernel void dropout1DForward(
     }
 }
 
-kernel void dropout1DBackward(
+kernel void dropout1DBackwardFloat(
     const device float * delta,
     const device bool * dropout,
     constant uint * pNbNeurons,
diff --git a/Sources/GrAIdient/Metal/Kernel/Layer1DHalf.metal b/Sources/GrAIdient/Metal/Kernel/Layer1DHalf.metal
new file mode 100644
index 00000000..ce473260
--- /dev/null
+++ b/Sources/GrAIdient/Metal/Kernel/Layer1DHalf.metal
@@ -0,0 +1,915 @@
+//
+// Layer1D.metal
+// GrAIdient
+//
+// Created by Jean-François Reboud on 14/10/2022.
+//
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void MSE1DLossHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * losses,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbBatch && outs && groundTruth && losses)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * elem;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float diff = out - gt;
+
+        tmp += diff * diff;
+    }
+    losses[elem] = tmp;
+}
+
+kernel void MSE1DLossDerivativeHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbNeurons && pNbBatch && pCoeff && pDirty &&
+        outs && groundTruth && deltaPrev)
+    {
+        nbNeurons = *pNbNeurons;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+
+    float gt = groundTruth[offset];
+    float out = outs[offset];
+    float diff = out - gt;
+
+    if (dirty)
+    {
+        deltaPrev[offset] = 2 * coeff * diff / float(nbNeurons * nbBatch);
+    }
+    else
+    {
+        deltaPrev[offset] += 2 * coeff * diff / float(nbNeurons * nbBatch);
+    }
+}
+
+kernel void linearErrorLossHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * losses,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbBatch && outs && groundTruth && losses)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * elem;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float diff = out - gt;
+
+        tmp += diff;
+    }
+    losses[elem] = tmp;
+}
+
+kernel void linearErrorLossDerivativeHalf(
+    const device half * outs,
+    constant uint * pNbNeurons,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbNeurons && pNbBatch && pCoeff && pDirty &&
+        outs && deltaPrev)
+    {
+        nbNeurons = *pNbNeurons;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+
+    if (dirty)
+    {
+        deltaPrev[offset] = coeff / float(nbNeurons * nbBatch);
+    }
+    else
+    {
+        deltaPrev[offset] += coeff / float(nbNeurons * nbBatch);
+    }
+}
+
+kernel void selectNeurons1DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbNeurons,
+    constant uint * pNbNeuronsPrev,
+    constant uint * pNeurons,
+    constant float * pCoeffs,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbNeuronsPrev;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbNeuronsPrev && pNeurons && pCoeffs && pNbBatch &&
+        outsPrev && outs)
+    {
+        nbNeurons = *pNbNeurons;
+        nbNeuronsPrev = *pNbNeuronsPrev;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    uint offsetPrev = pNeurons[depth] + nbNeuronsPrev * elem;
+    outs[offset] = pCoeffs[depth] * outsPrev[offsetPrev];
+}
+
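+// Note: every Half kernel in this file mirrors its Float counterpart line
+// for line. A possible alternative (a sketch only, not what this commit
+// does) would be a single templated body with explicitly instantiated,
+// host-visible names, which MSL supports:
+//
+//     template <typename T>
+//     kernel void MSE1DLoss(
+//         const device T * outs, /* ... same parameter list ... */ );
+//
+//     template [[ host_name("MSE1DLossFloat") ]]
+//     kernel void MSE1DLoss<float>(const device float * outs /* ... */ );
+//     template [[ host_name("MSE1DLossHalf") ]]
+//     kernel void MSE1DLoss<half>(const device half * outs /* ... */ );
+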
+kernel void selectNeurons1DBackwardHalf(
+    const device half * delta,
+    constant uint * pNbNeurons,
+    constant uint * pNbNeuronsPrev,
+    constant uint * pNeurons,
+    constant float * pCoeffs,
+    constant uint * pNbBatch,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbNeuronsPrev;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbNeuronsPrev && pNeurons && pCoeffs && pNbBatch &&
+        deltaPrev && delta)
+    {
+        nbNeurons = *pNbNeurons;
+        nbNeuronsPrev = *pNbNeuronsPrev;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    uint offsetPrev = pNeurons[depth] + nbNeuronsPrev * elem;
+    deltaPrev[offsetPrev] += pCoeffs[depth] * delta[offset];
+}
+
+kernel void concat1DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pGlobalOffset,
+    constant uint * pNbNeurons,
+    constant uint * pNbNeuronsPrev,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbNeuronsPrev;
+    uint nbBatch;
+    uint globalOffset;
+
+    if (pGlobalOffset && pNbNeurons && pNbNeuronsPrev && pNbBatch &&
+        outsPrev && outs)
+    {
+        nbNeurons = *pNbNeurons;
+        nbNeuronsPrev = *pNbNeuronsPrev;
+        nbBatch = *pNbBatch;
+        globalOffset = *pGlobalOffset;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeuronsPrev || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetPrev = depth + nbNeuronsPrev * elem;
+    uint offset = globalOffset+depth + nbNeurons * elem;
+
+    outs[offset] = outsPrev[offsetPrev];
+}
+
+kernel void concat1DBackwardHalf(
+    const device half * delta,
+    constant uint * pGlobalOffset,
+    constant uint * pNbNeurons,
+    constant uint * pNbNeuronsPrev,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbNeuronsPrev;
+    uint nbBatch;
+    uint globalOffset;
+    uint dirty;
+
+    if (pGlobalOffset && pNbNeurons && pNbNeuronsPrev && pNbBatch && pDirty &&
+        deltaPrev && delta)
+    {
+        nbNeurons = *pNbNeurons;
+        nbNeuronsPrev = *pNbNeuronsPrev;
+        nbBatch = *pNbBatch;
+        globalOffset = *pGlobalOffset;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeuronsPrev || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetPrev = depth + nbNeuronsPrev * elem;
+    uint offset = globalOffset+depth + nbNeurons * elem;
+
+    if (dirty)
+    {
+        deltaPrev[offsetPrev] = delta[offset];
+    }
+    else
+    {
+        deltaPrev[offsetPrev] += delta[offset];
+    }
+}
+
+kernel void softmax1DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbHeads,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbHeads;
+    uint size;
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbHeads && pNbNeurons && pNbBatch && outsPrev && outs)
+    {
+        nbHeads = *pNbHeads;
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        size = nbNeurons / nbHeads;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+    uint head = depth / size;
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float cMax = outsPrev[0+head*size + nbNeurons * elem];
+    for (uint j=0; j<size; j++)
+    {
+        uint offset1 = j+head*size + nbNeurons * elem;
+
+        float outPrev = outsPrev[offset1];
+        if (outPrev > cMax)
+        {
+            cMax = outPrev;
+        }
+    }
+
+    float sum1 = 0.0;
+    for (uint j=0; j<size; j++)
+    {
+        uint offset1 = j+head*size + nbNeurons * elem;
+
+        float outPrev = outsPrev[offset1];
+        sum1 += exp(outPrev - cMax);
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    float outPrev = outsPrev[offset];
+
+    outs[offset] = exp(outPrev - cMax) / sum1;
+}
+
+kernel void softmax1DBackwardHalf(
+    const device half * outs,
+    const device half * delta,
+    constant uint * pNbHeads,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbHeads;
+    uint size;
+    uint nbNeurons;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbHeads && pNbNeurons && pNbBatch && pDirty &&
+        outs && delta && deltaPrev)
+    {
+        nbHeads = *pNbHeads;
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+        size = nbNeurons / nbHeads;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+    uint head = depth / size;
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    float outCur = outs[offset];
+    float deltaCur = delta[offset];
+
+    float sum1 = 0.0;
+    for (uint j=0; j<size; j++)
+    {
+        uint offset1 = j+head*size + nbNeurons * elem;
+
+        float outCur1 = outs[offset1];
+        float deltaCur1 = delta[offset1];
+        sum1 += outCur1 * deltaCur1;
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = outCur * (deltaCur - sum1);
+    }
+    else
+    {
+        deltaPrev[offset] += outCur * (deltaCur - sum1);
+    }
+}
+
+kernel void dotProduct1DForwardHalf(
+    const device half * outsPrev1,
+    const device half * outsPrev2,
+    constant int * pSize,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint size;
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pSize && pNbNeurons && pNbBatch &&
+        outsPrev1 && outsPrev2 && outs)
+    {
+        size = *pSize;
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float sum = 0.0;
+    for (uint j=0; j<size; j++)
+    {
+        uint offsetPrev = j + depth * size + nbNeurons * size * elem;
+
+        float out1 = outsPrev1[offsetPrev];
+        float out2 = outsPrev2[offsetPrev];
+        sum += out1 * out2;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    outs[offset] = sum;
+}
+
+kernel void dotProduct1DBackwardHalf(
+    const device half * outsPrev,
+    const device half * delta,
+    constant int * pSize,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint size;
+    uint nbNeurons;
+    uint nbBatch;
+    uint dirty;
+
+    if (pSize && pNbNeurons && pNbBatch && pDirty &&
+        outsPrev && delta && deltaPrev)
+    {
+        size = *pSize;
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    for (uint j=0; j<size; j++)
+    {
+        uint offsetPrev = j + depth * size + nbNeurons * size * elem;
+        uint offset = depth + nbNeurons * elem;
+
+        if (dirty)
+        {
+            deltaPrev[offsetPrev] = outsPrev[offsetPrev] * delta[offset];
+        }
+        else
+        {
+            deltaPrev[offsetPrev] += outsPrev[offsetPrev] * delta[offset];
+        }
+    }
+}
+
+kernel void constant1DForwardHalf(
+    const device half * weights,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbBatch && weights && outs)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    outs[offset] = weights[depth];
+}
+
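+// Precision note: buffers in this file are stored as half, while arithmetic
+// runs in float locals (loads widen, stores narrow). For the BCE kernels
+// below this matters: out is assumed to lie strictly in (0, 1), and
+// evaluating -1 / out or 1 / (1 - out) in half would overflow once the
+// magnitude passed half's maximum finite value of 65504.
+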
+kernel void BCE1DLossHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * losses,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbBatch && outs && groundTruth && losses)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * elem;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float tmp1 = log(out);
+        float tmp2 = log(1 - out);
+
+        tmp -= (gt * tmp1 + (1 - gt) * tmp2);
+    }
+    losses[elem] = tmp;
+}
+
+kernel void BCE1DLossDerivativeHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbNeurons && pNbBatch && pCoeff && pDirty &&
+        outs && groundTruth && deltaPrev)
+    {
+        nbNeurons = *pNbNeurons;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+
+    float gt = groundTruth[offset];
+    float out = outs[offset];
+    float derivative = 0.0;
+
+    if (gt == 1.0)
+    {
+        derivative = -1 / out;
+    }
+    else if (gt == 0.0)
+    {
+        derivative = 1 / (1 - out);
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = coeff * derivative / float(nbNeurons * nbBatch);
+    }
+    else
+    {
+        deltaPrev[offset] += coeff * derivative / float(nbNeurons * nbBatch);
+    }
+}
+
+kernel void BCESigmoid1DLossHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * losses,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbBatch && outs && groundTruth && losses)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * elem;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float value;
+
+        if (out > 0)
+        {
+            value = (1 - gt) * out;
+            value += log(1 + exp(-out));
+        }
+        else
+        {
+            value = -out * gt;
+            value += log(exp(out) + 1);
+        }
+
+        tmp += value;
+    }
+
+    losses[elem] = tmp;
+}
+
+kernel void BCESigmoid1DLossDerivativeHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbNeurons && pNbBatch && pCoeff && pDirty &&
+        outs && groundTruth && deltaPrev)
+    {
+        nbNeurons = *pNbNeurons;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+
+    float gt = groundTruth[offset];
+    float out = outs[offset];
+    float value;
+
+    if (out >= 0)
+    {
+        value = 1.0 / (1.0 + exp(-out));
+    }
+    else
+    {
+        value = exp(out) / (1.0 + exp(out));
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = coeff * (value - gt) / float(nbNeurons * nbBatch);
+    }
+    else
+    {
+        deltaPrev[offset] += coeff * (value - gt) / float(nbNeurons * nbBatch);
+    }
+}
+
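+// Inverted dropout: when applyDropout is set, kept activations are scaled by
+// 1 / (1 - coeff), coeff being the drop probability, so that the expected
+// activation matches inference, where applyDropout is false and values pass
+// through unchanged.
+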
+kernel void dropout1DForwardHalf(
+    const device half * outsPrev,
+    const device bool * dropout,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant bool * pApplyDropout,
+    constant float * pCoeff,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    bool applyDropout;
+    float coeff;
+
+    if (pNbNeurons && pNbBatch && pApplyDropout && pCoeff &&
+        dropout && outsPrev && outs)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        applyDropout = *pApplyDropout;
+        coeff = *pCoeff;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    if (applyDropout && !dropout[offset])
+    {
+        outs[offset] = 1.0 / (1.0 - coeff) * outsPrev[offset];
+    }
+    else if (applyDropout)
+    {
+        outs[offset] = 0.0;
+    }
+    else
+    {
+        outs[offset] = outsPrev[offset];
+    }
+}
+
+kernel void dropout1DBackwardHalf(
+    const device half * delta,
+    const device bool * dropout,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant bool * pApplyDropout,
+    constant float * pCoeff,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    bool applyDropout;
+    float coeff;
+    uint dirty;
+
+    if (pNbNeurons && pNbBatch && pApplyDropout && pCoeff &&
+        dropout && delta && deltaPrev)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        applyDropout = *pApplyDropout;
+        coeff = *pCoeff;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float newValue = 0.0;
+    uint offset = depth + nbNeurons * elem;
+    if (applyDropout && !dropout[offset])
+    {
+        newValue = 1.0 / (1.0 - coeff) * delta[offset];
+    }
+    else if (applyDropout)
+    {
+        newValue = 0.0;
+    }
+    else
+    {
+        newValue = delta[offset];
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = newValue;
+    }
+    else
+    {
+        deltaPrev[offset] += newValue;
+    }
+}
diff --git a/Sources/GrAIdient/Metal/Kernel/Layer2D.metal b/Sources/GrAIdient/Metal/Kernel/Layer2DFloat.metal
similarity index 97%
rename from Sources/GrAIdient/Metal/Kernel/Layer2D.metal
rename to Sources/GrAIdient/Metal/Kernel/Layer2DFloat.metal
index 818f528b..72ca39f1 100644
--- a/Sources/GrAIdient/Metal/Kernel/Layer2D.metal
+++ b/Sources/GrAIdient/Metal/Kernel/Layer2DFloat.metal
@@ -8,7 +8,7 @@
 #include <metal_stdlib>
 using namespace metal;
 
-kernel void avgPoolForward(
+kernel void avgPoolForwardFloat(
     const device float * outsPrev,
     constant uint * pNbNeurons,
     constant uint * pDimensionsPrev,
@@ -54,7 +54,7 @@
     outs[offset] = tmp;
 }
 
-kernel void avgPoolBackward(
+kernel void avgPoolBackwardFloat(
     const device float * delta,
     constant uint * pNbNeurons,
     constant uint * pDimensionsPrev,
@@ -107,7 +107,7 @@
     }
 }
 
-kernel void maxPoolForward(
+kernel void maxPoolForwardFloat(
     const device float * outsPrev,
     constant int * pStart,
     constant uint * pStride,
@@ -184,7 +184,7 @@
     indicesMax[offset] = indexMax;
 }
 
-kernel void maxPoolBackward(
+kernel void maxPoolBackwardFloat(
     const device float * delta,
     const device int * indicesMax,
     constant int * pStart,
@@ -291,7 +291,7 @@ uint _endIndex(uint index, uint smallSize, uint bigSize)
     return (uint)(ceil(float((index + 1) * bigSize) / smallSize));
 }
 
-kernel void adaptiveAvgPoolForward1(
+kernel void adaptiveAvgPoolForward1Float(
    const device float * outsPrev,
    constant uint * pNbChannels,
    constant uint * pDimensions,
@@ -353,7 +353,7 @@
     outs[offset] = tmp / (float)nbElems;
 }
 
-kernel void adaptiveAvgPoolForward2(
+kernel void adaptiveAvgPoolForward2Float(
     const device float * outsPrev,
     constant uint * pNbChannels,
     constant uint * pDimensions,
@@ -424,7 +424,7 @@
     }}
 }
 
-kernel void adaptiveAvgPoolBackward1(
+kernel void adaptiveAvgPoolBackward1Float(
     const device float * delta,
     constant uint * pNbChannels,
     constant uint * pDimensions,
@@ -487,7 +487,7 @@
     }}
 }
 
-kernel void adaptiveAvgPoolBackward2(
+kernel void 
adaptiveAvgPoolBackward2( }} } -kernel void selectNeurons2DForward( +kernel void selectNeurons2DForwardFloat( const device float * outsPrev, constant uint * pTarget, constant uint * pNbNeurons, @@ -591,7 +591,7 @@ kernel void selectNeurons2DForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void selectNeurons2DBackward( +kernel void selectNeurons2DBackwardFloat( const device float * delta, constant uint * pTarget, constant uint * pNbNeurons, @@ -652,7 +652,7 @@ kernel void selectNeurons2DBackward( } } -kernel void IRDFT2RGBForward( +kernel void IRDFT2RGBForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -707,7 +707,7 @@ kernel void IRDFT2RGBForward( outs[offset] = sum; } -kernel void IRDFT2RGBBackward( +kernel void IRDFT2RGBBackwardFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -779,7 +779,7 @@ kernel void IRDFT2RGBBackward( } } -kernel void decorrelateRGBForward( +kernel void decorrelateRGBForwardFloat( const device float * outsPrev, constant float * correlation, constant uint * pNbChannels, @@ -831,7 +831,7 @@ kernel void decorrelateRGBForward( outs[offset] = sum; } -kernel void decorrelateRGBBackward( +kernel void decorrelateRGBBackwardFloat( const device float * delta, constant float * correlation, constant uint * pNbChannels, @@ -894,7 +894,7 @@ kernel void decorrelateRGBBackward( } } -kernel void linearScale2DForward( +kernel void linearScale2DForwardFloat( const device float * outsPrev, constant float * weights, constant uint * pNbChannels, @@ -935,7 +935,7 @@ kernel void linearScale2DForward( outs[offset] = weights[0] * outsPrev[offset] + weights[1]; } -kernel void linearScale2DBackward( +kernel void linearScale2DBackwardFloat( const device float * delta, constant float * weights, constant uint * pNbChannels, @@ -996,7 +996,7 @@ float _getScaleValue( return (1.0 / freq) * float(dimension); } -kernel void setDataFTFrequences2D( +kernel void setDataFTFrequences2DFloat( constant uint * pNbChannels, constant uint * pDimension, constant uint * pNbBatch, @@ -1063,7 +1063,7 @@ kernel void setDataFTFrequences2D( outs[offset] = _getScaleValue(iTmp, jTmp, dimension); } -kernel void pad2DForward( +kernel void pad2DForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -1124,7 +1124,7 @@ kernel void pad2DForward( } } -kernel void pad2DBackward( +kernel void pad2DBackwardFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -1184,7 +1184,7 @@ kernel void pad2DBackward( } } -kernel void crop2DForward( +kernel void crop2DForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -1238,7 +1238,7 @@ kernel void crop2DForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void crop2DBackward( +kernel void crop2DBackwardFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -1312,7 +1312,7 @@ kernel void crop2DBackward( } } -kernel void resizeBilinearPadForward( +kernel void resizeBilinearPadForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -1406,7 +1406,7 @@ kernel void resizeBilinearPadForward( } } -kernel void resizeBilinearPadBackward( +kernel void resizeBilinearPadBackwardFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -1526,7 +1526,7 @@ kernel void resizeBilinearPadBackward( }} } -kernel void 
rotate2DForward( +kernel void rotate2DForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -1590,7 +1590,7 @@ kernel void rotate2DForward( } } -kernel void rotate2DBackward( +kernel void rotate2DBackwardFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -1659,7 +1659,7 @@ kernel void rotate2DBackward( }} } -kernel void resizeBilinearCropForward( +kernel void resizeBilinearCropForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -1740,7 +1740,7 @@ kernel void resizeBilinearCropForward( outs[offset] = out; } -kernel void resizeBilinearCropBackward( +kernel void resizeBilinearCropBackwardFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -1861,7 +1861,7 @@ kernel void resizeBilinearCropBackward( }} } -kernel void concat02DForward( +kernel void concat02DForwardFloat( const device float * outsPrev, constant uint * pGlobalOffset, constant uint * pNbChannels, @@ -1907,7 +1907,7 @@ kernel void concat02DForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void concat02DBackward( +kernel void concat02DBackwardFloat( const device float * delta, constant uint * pGlobalOffset, constant uint * pNbChannels, @@ -1963,7 +1963,7 @@ kernel void concat02DBackward( } } -kernel void concat12DForward( +kernel void concat12DForwardFloat( const device float * outsPrev, constant uint * pGlobalOffset, constant uint * pNbChannels, @@ -2012,7 +2012,7 @@ kernel void concat12DForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void concat12DBackward( +kernel void concat12DBackwardFloat( const device float * delta, constant uint * pGlobalOffset, constant uint * pNbChannels, @@ -2071,7 +2071,7 @@ kernel void concat12DBackward( } } -kernel void constant2DForward( +kernel void constant2DForwardFloat( const device float * weights, constant uint * pNbChannels, constant uint * pDimensions, @@ -2110,7 +2110,7 @@ kernel void constant2DForward( outs[offset] = weights[depth]; } -kernel void MSE2DLoss( +kernel void MSE2DLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbChannels, @@ -2160,7 +2160,7 @@ kernel void MSE2DLoss( losses[elem] = tmp; } -kernel void MSE2DLossDerivative( +kernel void MSE2DLossDerivativeFloat( const device float * outs, const device float * groundTruth, constant uint * pNbChannels, @@ -2220,7 +2220,7 @@ kernel void MSE2DLossDerivative( } } -kernel void selfCorrelate2DForward( +kernel void selfCorrelate2DForwardFloat( const device float * outsPrev, constant uint * pNbChannelsPrev, constant uint * pDimensionsPrev, @@ -2271,7 +2271,7 @@ kernel void selfCorrelate2DForward( outs[offset] = correlation; } -kernel void selfCorrelate2DBackward( +kernel void selfCorrelate2DBackwardFloat( const device float * delta, const device float * outsPrev, constant uint * pNbChannelsPrev, @@ -2342,7 +2342,7 @@ kernel void selfCorrelate2DBackward( } } -kernel void normalize12DForward( +kernel void normalize12DForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -2394,7 +2394,7 @@ kernel void normalize12DForward( outs[offset] = outPrev / max(norm, 1e-12); } -kernel void normalize12DBackward( +kernel void normalize12DBackwardFloat( const device float * delta, const device float * outsPrev, constant uint * pNbChannels, @@ -2480,7 +2480,7 @@ kernel void normalize12DBackward( } } -kernel void computeSquaredNorm122D( +kernel void 
computeSquaredNorm122DFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -2549,7 +2549,7 @@ kernel void computeSquaredNorm122D( } } -kernel void normalize122DForward( +kernel void normalize122DForwardFloat( const device float * outsPrev, const device float * squaredNorms, constant uint * pNbChannels, @@ -2596,7 +2596,7 @@ kernel void normalize122DForward( outs[offset] = outPrev / max(norm, 1e-12); } -kernel void computeDeltaTmp122D( +kernel void computeDeltaTmp122DFloat( const device float * delta, const device float * outsPrev, const device float * squaredNorms, @@ -2673,7 +2673,7 @@ kernel void computeDeltaTmp122D( } } -kernel void normalize122DBackward( +kernel void normalize122DBackwardFloat( const device float * delta, const device float * outsPrev, const device float * squaredNorms, @@ -2746,7 +2746,7 @@ kernel void normalize122DBackward( } } -kernel void similarBatchError2DLoss( +kernel void similarBatchError2DLossFloat( const device float * outs, constant uint * pNbChannels, constant uint * pDimensions, @@ -2795,7 +2795,7 @@ kernel void similarBatchError2DLoss( } } -kernel void similarBatchError2DLossDerivative( +kernel void similarBatchError2DLossDerivativeFloat( const device float * outs, constant uint * pNbChannels, constant uint * pDimensions, @@ -2856,7 +2856,7 @@ kernel void similarBatchError2DLossDerivative( } } -kernel void similarError2DLossDerivative( +kernel void similarError2DLossDerivativeFloat( const device float * outs, constant uint * pGlobalOffset, constant uint * pNbChannels, @@ -2923,7 +2923,7 @@ kernel void similarError2DLossDerivative( } } -kernel void flipHorizontal2DForward( +kernel void flipHorizontal2DForwardFloat( const device float * outsPrev, constant uint * pDoFlip, constant uint * pNbChannels, @@ -2971,7 +2971,7 @@ kernel void flipHorizontal2DForward( outs[offset1] = outsPrev[offset2]; } -kernel void flipHorizontal2DBackward( +kernel void flipHorizontal2DBackwardFloat( const device float * delta, constant uint * pDoFlip, constant uint * pNbChannels, @@ -3029,7 +3029,7 @@ kernel void flipHorizontal2DBackward( } } -kernel void flipVertical2DForward( +kernel void flipVertical2DForwardFloat( const device float * outsPrev, constant uint * pDoFlip, constant uint * pNbChannels, @@ -3077,7 +3077,7 @@ kernel void flipVertical2DForward( outs[offset1] = outsPrev[offset2]; } -kernel void flipVertical2DBackward( +kernel void flipVertical2DBackwardFloat( const device float * delta, constant uint * pDoFlip, constant uint * pNbChannels, @@ -3135,7 +3135,7 @@ kernel void flipVertical2DBackward( } } -kernel void colorJitterHSVForward( +kernel void colorJitterHSVForwardFloat( const device float * outsPrev, constant float * pNoise, constant uint * pDimensions, @@ -3260,7 +3260,7 @@ kernel void colorJitterHSVForward( outs[offsetB] = b; } -kernel void BCE2DLoss( +kernel void BCE2DLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbChannels, @@ -3311,7 +3311,7 @@ kernel void BCE2DLoss( losses[elem] = tmp; } -kernel void BCE2DLossDerivative( +kernel void BCE2DLossDerivativeFloat( const device float * outs, const device float * groundTruth, constant uint * pNbChannels, @@ -3380,7 +3380,7 @@ kernel void BCE2DLossDerivative( } } -kernel void BCESigmoid2DLoss( +kernel void BCESigmoid2DLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbChannels, @@ -3441,7 +3441,7 @@ kernel void BCESigmoid2DLoss( losses[elem] = tmp; } -kernel void 
BCESigmoid2DLossDerivative(
+kernel void BCESigmoid2DLossDerivativeFloat(
     const device float * outs,
     const device float * groundTruth,
     constant uint * pNbChannels,
@@ -3510,7 +3510,7 @@
     }
 }
 
-kernel void layerCAM2DForward(
+kernel void layerCAM2DForwardFloat(
     const device float * outsPrev,
     const device float * deltaPrev,
     constant uint * pNbChannelsPrev,
diff --git a/Sources/GrAIdient/Metal/Kernel/Layer2DHalf.metal b/Sources/GrAIdient/Metal/Kernel/Layer2DHalf.metal
new file mode 100644
index 00000000..08fe23dc
--- /dev/null
+++ b/Sources/GrAIdient/Metal/Kernel/Layer2DHalf.metal
@@ -0,0 +1,3570 @@
+//
+// Layer2D.metal
+// GrAIdient
+//
+// Created by Jean-François Reboud on 14/10/2022.
+//
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void avgPoolForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbNeurons,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint heightPrev, widthPrev;
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbNeurons && pDimensionsPrev && pNbBatch &&
+        outsPrev && outs)
+    {
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetStartPrev = (depth + nbNeurons * elem) * heightPrev;
+
+    float tmp = 0.0;
+    for (uint i=0; i<heightPrev; i++){
+    for (uint j=0; j<widthPrev; j++)
+    {
+        uint offsetPrev = j + (offsetStartPrev + i) * widthPrev;
+        tmp += outsPrev[offsetPrev];
+    }}
+    tmp /= heightPrev * widthPrev;
+
+    uint offset = depth + nbNeurons * elem;
+    outs[offset] = tmp;
+}
+
+kernel void avgPoolBackwardHalf(
+    const device half * delta,
+    constant uint * pNbNeurons,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint heightPrev, widthPrev;
+    uint nbNeurons;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbNeurons && pDimensionsPrev && pNbBatch && pDirty &&
+        delta && deltaPrev)
+    {
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depthPrev = id[0] / widthPrev;
+    uint elem = id[1] / heightPrev;
+    uint i = id[1] % heightPrev;
+    uint j = id[0] % widthPrev;
+
+    if (i * elem >= heightPrev * nbBatch ||
+        j * depthPrev >= widthPrev * nbNeurons)
+    {
+        return ;
+    }
+
+    uint offset = depthPrev + nbNeurons * elem;
+    float deltaCur = delta[offset];
+
+    uint offsetStartPrev = (depthPrev + nbNeurons * elem) * heightPrev;
+    uint offsetPrev = j + (offsetStartPrev + i) * widthPrev;
+
+    if (dirty)
+    {
+        deltaPrev[offsetPrev] = deltaCur / (heightPrev * widthPrev);
+    }
+    else
+    {
+        deltaPrev[offsetPrev] += deltaCur / (heightPrev * widthPrev);
+    }
+}
+
+kernel void maxPoolForwardHalf(
+    const device half * outsPrev,
+    constant int * pStart,
+    constant uint * pStride,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * outs,
+    device int * indicesMax,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    int start, end;
+    uint stride;
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pStart && pStride && pNbChannels && pDimensions && pDimensionsPrev &&
+        pNbBatch && outsPrev && outs && indicesMax)
+    {
+        start = pStart[0];
+        end = pStart[1];
+        stride = pStride[0];
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev;
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    int indexMax = -1;
+    float maxVal = -10000.0;
+    for (int k=start; k<=end; k++){
+    for (int l=start; l<=end; l++)
+    {
+        if ((int)(stride*j)+l >= 0 &&
+            (int)(stride*j)+l < (int)widthPrev &&
+            (int)(stride*i)+k >= 0 &&
+            (int)(stride*i)+k < (int)heightPrev)
+        {
+            uint offsetPrev = (int)(stride*j)+l +
+                (offsetStartPrev + (int)(stride*i)+k)*widthPrev;
+            
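+            // Cache the flat input index of the running maximum so that
+            // maxPoolBackwardHalf below can route the gradient back to the
+            // argmax cell only.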
+ float outPrev = outsPrev[offsetPrev]; + if (outPrev > maxVal) + { + indexMax = offsetPrev; + indicesMax[offset] = offsetPrev; + maxVal = outPrev; + } + } + }} + + outs[offset] = maxVal; + indicesMax[offset] = indexMax; +} + +kernel void maxPoolBackwardHalf( + const device half * delta, + const device int * indicesMax, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + int start, end; + uint stride; + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint nbBatch; + uint dirty; + + if (pStart && pStride && pNbChannels && pDimensions && pDimensionsPrev && + pNbBatch && pDirty && delta && indicesMax && deltaPrev) + { + start = pStart[0]; + end = pStart[1]; + stride = pStride[0]; + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depth >= widthPrev * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + float tmp = 0.0; + for (int k=start; k<=end; k++){ + for (int l=start; l<=end; l++) + { + int i1, j1; + // i-k rather than i+k to take into account non symetric kernels. + // Exemple: size of kernel 2 instead of 3. + if ((i-k) % stride != 0) + { + continue; + } + else if ((j-l) % stride != 0) + { + continue; + } + else + { + i1 = (i-k) / stride; + j1 = (j-l) / stride; + } + if (j1 >= 0 && j1 < (int)width && + i1 >= 0 && i1 < (int)height) + { + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j1 + (offsetStart + i1) * width; + + if ((uint)indicesMax[offset] == offsetPrev) + { + tmp += delta[offset]; + } + } + }} + + if (dirty) + { + deltaPrev[offsetPrev] = tmp; + } + else + { + deltaPrev[offsetPrev] += tmp; + } +} + +uint _startIndex(uint index, uint smallSize, uint bigSize) +{ + float val = float(index * bigSize) / smallSize; + val = round(val * 1000) / 1000; + return (uint)(floor(val)); +} + +uint _endIndex(uint index, uint smallSize, uint bigSize) +{ + return (uint)(ceil(float((index + 1) * bigSize) / smallSize)); +} + +kernel void adaptiveAvgPoolForward1Half( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pDimensionsPrev && pNbBatch && + outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint startI = _startIndex(i, height, heightPrev); + uint endI = _endIndex(i, height, heightPrev); + uint startJ = 
_startIndex(j, width, widthPrev);
+    uint endJ = _endIndex(j, width, widthPrev);
+
+    uint nbElemsI = endI - startI;
+    uint nbElemsJ = endJ - startJ;
+    uint nbElems = nbElemsI * nbElemsJ;
+
+    uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev;
+    uint offsetStart = (depth + nbChannels * elem) * height;
+
+    float tmp = 0.0;
+    for (uint k=0; k<nbElemsI; k++){
+    for (uint l=0; l<nbElemsJ; l++)
+    {
+        uint offsetPrev = startJ+l + (offsetStartPrev + startI+k) * widthPrev;
+        tmp += outsPrev[offsetPrev];
+    }}
+
+    uint offset = j + (offsetStart + i) * width;
+    outs[offset] = tmp / (float)nbElems;
+}
+
+kernel void adaptiveAvgPoolForward2Half(
+    const device half * outsPrev,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * outs,
+    device int * nbElems,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pDimensionsPrev && pNbBatch &&
+        outsPrev && nbElems && outs)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbChannels || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev;
+
+    for (uint i = 0; i < heightPrev; i++) {
+    for (uint j = 0; j < widthPrev; j++)
+    {
+        uint startI = _startIndex(i, heightPrev, height);
+        uint endI = _endIndex(i, heightPrev, height);
+        uint startJ = _startIndex(j, widthPrev, width);
+        uint endJ = _endIndex(j, widthPrev, width);
+
+        uint nbElemsI = endI - startI;
+        uint nbElemsJ = endJ - startJ;
+
+        uint offsetPrev = j + (offsetStartPrev + i) * widthPrev;
+        float outPrev = outsPrev[offsetPrev];
+
+        for (uint k = 0; k < nbElemsI; k++){
+        for (uint l = 0; l < nbElemsJ; l++)
+        {
+            uint offset = startJ+l + (offsetStart + startI+k) * width;
+
+            outs[offset] += outPrev;
+            nbElems[offset] += 1;
+        }}
+    }}
+
+    for (uint I = 0; I < height; I++){
+    for (uint J = 0; J < width; J++)
+    {
+        uint offset = J + (offsetStart + I) * width;
+        outs[offset] /= nbElems[offset];
+    }}
+}
+
+kernel void adaptiveAvgPoolBackward1Half(
+    const device half * delta,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pDimensionsPrev && pNbBatch &&
+        delta && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbChannels || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev;
+
+    for (uint i = 0; i < height; i++) {
+    for (uint j = 0; j < width; j++)
+    {
+        uint startI = _startIndex(i, height, heightPrev);
+        uint endI = _endIndex(i, height, heightPrev);
+        uint startJ = _startIndex(j, width, widthPrev);
+        uint endJ = _endIndex(j, width, widthPrev);
+
+        uint nbElemsI = endI - startI;
+        uint nbElemsJ = endJ - startJ;
+        uint nbElems = nbElemsI * nbElemsJ;
+
+        uint offset = j + (offsetStart + i) * width;
+        float deltaCur = delta[offset] / (float)nbElems;
+
+        for (uint k = 0; k < nbElemsI; k++){
+        for (uint l = 0; l < nbElemsJ; l++)
+        {
+            uint offsetPrev = startJ+l +
+                (offsetStartPrev + startI+k) * widthPrev;
+            deltaPrev[offsetPrev] += deltaCur;
+        }}
+    }}
+}
+
+kernel void adaptiveAvgPoolBackward2Half(
+    const device half * delta,
+    const device int * nbElems,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pDimensionsPrev && pNbBatch &&
+        delta && nbElems && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
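+        // nbElems was filled by adaptiveAvgPoolForward2Half above: for each
+        // output cell it holds how many input cells were averaged into it,
+        // and the loop below divides the incoming gradient by that count.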
+ nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1]; + + if (depth >= nbChannels || elem >= nbBatch) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + + for (uint i = 0; i < heightPrev; i++) { + for (uint j = 0; j < widthPrev; j++) + { + uint startI = _startIndex(i, heightPrev, height); + uint endI = _endIndex(i, heightPrev, height); + uint startJ = _startIndex(j, widthPrev, width); + uint endJ = _endIndex(j, widthPrev, width); + + uint nbElemsI = endI - startI; + uint nbElemsJ = endJ - startJ; + + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + for (uint k = 0; k < nbElemsI; k++){ + for (uint l = 0; l < nbElemsJ; l++) + { + uint offset = startJ+l + (offsetStart + startI+k) * width; + deltaPrev[offsetPrev] += delta[offset] / nbElems[offset]; + }} + }} +} + +kernel void selectNeurons2DForwardHalf( + const device half * outsPrev, + constant uint * pTarget, + constant uint * pNbNeurons, + constant uint * pDimensionsPrev, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint targetI, targetJ; + uint heightPrev, widthPrev; + uint nbNeurons; + uint nbBatch; + + if (pTarget && pNbNeurons && pDimensionsPrev && pNbBatch && + outsPrev && outs) + { + targetI = pTarget[0]; + targetJ = pTarget[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1]; + + if (depth >= nbNeurons || elem >= nbBatch) + { + return ; + } + + uint offsetStartPrev = (depth + nbNeurons * elem) * heightPrev; + uint offsetPrev = targetJ + + (offsetStartPrev + targetI) * widthPrev; + uint offset = depth + nbNeurons * elem; + + outs[offset] = outsPrev[offsetPrev]; +} + +kernel void selectNeurons2DBackwardHalf( + const device half * delta, + constant uint * pTarget, + constant uint * pNbNeurons, + constant uint * pDimensionsPrev, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint targetI, targetJ; + uint heightPrev, widthPrev; + uint nbNeurons; + uint nbBatch; + uint dirty; + + if (pTarget && pNbNeurons && pDimensionsPrev && pNbBatch && pDirty && + delta && deltaPrev) + { + targetI = pTarget[0]; + targetJ = pTarget[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depthPrev = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depthPrev >= widthPrev * nbNeurons) + { + return ; + } + + float deltaCur = 0.0; + if (i == targetI && j == targetJ) + { + uint offset = depthPrev + nbNeurons * elem; + deltaCur = delta[offset]; + } + + uint offsetStartPrev = (depthPrev + nbNeurons * elem) * heightPrev; + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + if (dirty) + { + deltaPrev[offsetPrev] = deltaCur; + } + else + { + deltaPrev[offsetPrev] += deltaCur; + } +} + +kernel void IRDFT2RGBForwardHalf( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && 
pDimensions && pNbBatch && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStartRealPrev = (2 * depth + 2 * nbChannels * elem) * height; + uint offsetStartImPrev = (2 * depth + 1 + 2 * nbChannels * elem) * height; + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float sum = 0.0; + for (uint k=0; k= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStartRealPrev = (2 * depth + 2 * nbChannels * elem) * height; + uint offsetStartImPrev = (2 * depth + 1 + 2 * nbChannels * elem) * height; + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetRealPrev = j + (offsetStartRealPrev + i) * width; + uint offsetImPrev = j + (offsetStartImPrev + i) * width; + + float sum1 = 0.0; + float sum2 = 0.0; + for (uint k=0; k= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint block = depth / 3; + uint res = depth % 3; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float sum = 0.0; + for (uint k=0; k<3; k++) + { + uint offsetStartPrev = (block * 3 + k + nbChannels * elem) * height; + uint offsetPrev = j + (offsetStartPrev + i) * width; + + sum += outsPrev[offsetPrev] * correlation[res * 3 + k]; + } + outs[offset] = sum; +} + +kernel void decorrelateRGBBackwardHalf( + const device half * delta, + constant float * correlation, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + uint dirty; + + if (pNbChannels && pDimensions && pNbBatch && pDirty && + delta && correlation && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint block = depth / 3; + uint res = depth % 3; + + uint offsetStartPrev = (depth + nbChannels * elem) * height; + uint offsetPrev = j + (offsetStartPrev + i) * width; + + float sum = 0.0; + for (uint k=0; k<3; k++) + { + uint offsetStart = (block * 3 + k + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + sum += delta[offset] * correlation[k * 3 + res]; + } + + if (dirty) + { + deltaPrev[offsetPrev] = sum; + } + else + { + deltaPrev[offsetPrev] += sum; + } +} + +kernel void linearScale2DForwardHalf( + const device half * outsPrev, + constant float * weights, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbBatch && + outsPrev && weights && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] 
/ height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + outs[offset] = weights[0] * outsPrev[offset] + weights[1]; +} + +kernel void linearScale2DBackwardHalf( + const device half * delta, + constant float * weights, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + uint dirty; + + if (pNbChannels && pDimensions && pNbBatch && pDirty && + delta && weights && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * height; + uint offsetPrev = j + (offsetStartPrev + i) * width; + + if (dirty) + { + deltaPrev[offsetPrev] = delta[offsetPrev] * weights[0]; + } + else + { + deltaPrev[offsetPrev] += delta[offsetPrev] * weights[0]; + } +} + +float _getScaleValue( + const uint i, + const uint j, + const uint dimension) +{ + float freq = sqrt(float(i * i + j * j)) / float(dimension); + freq = max(freq, 1.0 / float(dimension)); + return (1.0 / freq) * float(dimension); +} + +kernel void setDataFTFrequences2DHalf( + constant uint * pNbChannels, + constant uint * pDimension, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint dimension; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimension && pNbBatch && outs) + { + dimension = *pDimension; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / dimension; + uint elem = id[1] / dimension; + uint i = id[1] % dimension; + uint j = id[0] % dimension; + + if (i * elem >= dimension * nbBatch || + j * depth >= dimension * nbChannels) + { + return ; + } + + uint end = dimension % 2 == 0 ? 
dimension / 2 : (dimension - 1) / 2; + uint jTmp = j; + uint iTmp = i; + if (dimension % 2 == 0) + { + if (jTmp >= end) + { + jTmp = jTmp - end + 1; + jTmp = end + 1 - jTmp; + } + if (iTmp >= end) + { + iTmp = iTmp - end + 1; + iTmp = end + 1 - iTmp; + } + } + else + { + if (jTmp > end) + { + jTmp = jTmp - end; + jTmp = end + 1 - jTmp; + } + if (iTmp > end) + { + iTmp = iTmp - end; + iTmp = end + 1 - iTmp; + } + } + + uint offsetStart = (depth + nbChannels * elem) * dimension; + uint offset = j + (offsetStart + i) * dimension; + + outs[offset] = _getScaleValue(iTmp, jTmp, dimension); +} + +kernel void pad2DForwardHalf( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pPadDimension, + constant float * pPadValue, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint padDimension; + float padValue; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbBatch && + pPadDimension && pPadValue && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + padDimension = *pPadDimension; + padValue = *pPadValue; + widthPrev = width - 2 * padDimension; + heightPrev = height - 2 * padDimension; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + if (i < padDimension || i >= height - padDimension || + j < padDimension || j >= width - padDimension) + { + outs[offset] = padValue; + } + else + { + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j-padDimension + + (offsetStartPrev + i-padDimension) * widthPrev; + + outs[offset] = outsPrev[offsetPrev]; + } +} + +kernel void pad2DBackwardHalf( + const device half * delta, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pPadDimension, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint padDimension; + uint nbBatch; + uint dirty; + + if (pNbChannels && pDimensions && pPadDimension && pNbBatch && pDirty && + delta && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + padDimension = *pPadDimension; + widthPrev = width - 2 * padDimension; + heightPrev = height - 2 * padDimension; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depth >= widthPrev * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j+padDimension + + (offsetStart + i+padDimension) * width; + + if (dirty) + { + deltaPrev[offsetPrev] = delta[offset]; + } + else + { + deltaPrev[offsetPrev] += delta[offset]; + } +} + +kernel void crop2DForwardHalf( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * 
pDimensions, + constant uint * pCropDimension, + constant uint * pCropOffsets, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint cropDimension; + uint offsetI, offsetJ; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbBatch && + pCropDimension && pCropOffsets && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + cropDimension = *pCropDimension; + offsetJ = pCropOffsets[0]; + offsetI = pCropOffsets[1]; + widthPrev = width + cropDimension; + heightPrev = height + cropDimension; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j+offsetJ + + (offsetStartPrev + i+offsetI) * widthPrev; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + outs[offset] = outsPrev[offsetPrev]; +} + +kernel void crop2DBackwardHalf( + const device half * delta, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pCropDimension, + constant uint * pCropOffsets, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint cropDimension; + uint offsetI, offsetJ; + uint nbBatch; + uint dirty; + + if (pNbChannels && pDimensions && pNbBatch && + pCropDimension && pCropOffsets && pDirty && + delta && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + cropDimension = *pCropDimension; + offsetJ = pCropOffsets[0]; + offsetI = pCropOffsets[1]; + widthPrev = width + cropDimension; + heightPrev = height + cropDimension; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depth >= widthPrev * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + if (dirty && + (i < offsetI || i >= height + offsetI || + j < offsetJ || j >= width + offsetJ)) + { + deltaPrev[offsetPrev] = 0.0; + } + else if (dirty) + { + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j-offsetJ + (offsetStart + i-offsetI) * width; + + deltaPrev[offsetPrev] = delta[offset]; + } + else if (i >= offsetI && i < height + offsetI && + j >= offsetJ && j < width + offsetJ) + { + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j-offsetJ + (offsetStart + i-offsetI) * width; + + deltaPrev[offsetPrev] += delta[offset]; + } +} + +kernel void resizeBilinearPadForwardHalf( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimensionsResize, + constant uint * pPadDimensions, + constant float * pPadValue, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint heightResize, widthResize; + uint nbChannels; 
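+    // Output layout: a bilinearly resized copy of the previous layer,
+    // surrounded by a constant border of padStartI/padEndI rows and
+    // padStartJ/padEndJ columns filled with padValue.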
+ uint padStartI, padEndI; + uint padStartJ, padEndJ; + float padValue; + uint nbBatch; + + if (pNbChannels && pDimensions && pDimensionsPrev && pDimensionsResize && + pPadDimensions && pPadValue && pNbBatch && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + widthResize = pDimensionsResize[0]; + heightResize = pDimensionsResize[1]; + padStartI = pPadDimensions[0]; + padEndI = pPadDimensions[1]; + padStartJ = pPadDimensions[2]; + padEndJ = pPadDimensions[3]; + padValue = *pPadValue; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + float ratioInOutI = float(heightPrev - 1) / float(heightResize - 1); + float ratioInOutJ = float(widthPrev - 1) / float(widthResize - 1); + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + if (i < padStartI || i >= height - padEndI || + j < padStartJ || j >= width - padEndJ) + { + outs[offset] = padValue; + } + else + { + float I = i-padStartI; + float J = j-padStartJ; + + float iPrev = I * ratioInOutI; + float jPrev = J * ratioInOutJ; + + uint iPrevInf = floor(iPrev); + uint iPrevSup = ceil(iPrev); + uint jPrevInf = floor(jPrev); + uint jPrevSup = ceil(jPrev); + + float iWeight = ratioInOutI * I - iPrevInf; + float jWeight = ratioInOutJ * J - jPrevInf; + + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev11 = jPrevInf + (offsetStartPrev + iPrevInf) * widthPrev; + uint offsetPrev12 = jPrevSup + (offsetStartPrev + iPrevInf) * widthPrev; + uint offsetPrev21 = jPrevInf + (offsetStartPrev + iPrevSup) * widthPrev; + uint offsetPrev22 = jPrevSup + (offsetStartPrev + iPrevSup) * widthPrev; + + float out = outsPrev[offsetPrev11] * (1.0 - iWeight) * (1.0 - jWeight); + out += outsPrev[offsetPrev12] * (1.0 - iWeight) * jWeight; + out += outsPrev[offsetPrev21] * iWeight * (1.0 - jWeight); + out += outsPrev[offsetPrev22] * iWeight * jWeight; + + outs[offset] = out; + } +} + +kernel void resizeBilinearPadBackwardHalf( + const device half * delta, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimensionsResize, + constant uint * pPadDimensions, + constant uint * pNbBatch, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint heightResize, widthResize; + uint nbChannels; + uint padStartI, padEndI; + uint padStartJ, padEndJ; + uint nbBatch; + + if (pNbChannels && pDimensions && pDimensionsPrev && pDimensionsResize && + pPadDimensions && pNbBatch && delta && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + widthResize = pDimensionsResize[0]; + heightResize = pDimensionsResize[1]; + padStartI = pPadDimensions[0]; + padEndI = pPadDimensions[1]; + padStartJ = pPadDimensions[2]; + padEndJ = pPadDimensions[3]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depth >= widthPrev * nbChannels) + { + return ; + } + + float ratioInOutI 
= float(heightPrev - 1) / float(heightResize - 1); + float ratioInOutJ = float(widthPrev - 1) / float(widthResize - 1); + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + float kLow = (i-1.0) / ratioInOutI; + float kHigh = (i+1.0) / ratioInOutI; + float lLow = (j-1.0) / ratioInOutJ; + float lHigh = (j+1.0) / ratioInOutJ; + + int kStart = ceil(kLow); + int kEnd = floor(kHigh); + int lStart = ceil(lLow); + int lEnd = floor(lHigh); + + for (int k = kStart; k <= kEnd; k++) { + for (int l = lStart; l <= lEnd; l++) + { + if (k >= 0 && k < (int)heightResize && + l >= 0 && l < (int)widthResize) + { + float kPrev = k * ratioInOutI; + float lPrev = l * ratioInOutJ; + + uint kPrevInf = floor(kPrev); + uint kPrevSup = ceil(kPrev); + uint lPrevInf = floor(lPrev); + uint lPrevSup = ceil(lPrev); + + float kWeight = ratioInOutI * k - kPrevInf; + float lWeight = ratioInOutJ * l - lPrevInf; + + if (kPrevInf == i && lPrevInf == j) + { + uint offset = l+padStartJ + + (offsetStart + k+padStartI) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += + deltaCur * (1.0 - kWeight) * (1.0 - lWeight); + } + else if (kPrevInf == i && lPrevSup == j) + { + uint offset = l+padStartJ + + (offsetStart + k+padStartI) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += deltaCur * (1.0 - kWeight) * lWeight; + } + else if (kPrevSup == i && lPrevInf == j) + { + uint offset = l+padStartJ + + (offsetStart + k+padStartI) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += deltaCur * kWeight * (1.0 - lWeight); + } + else if (kPrevSup == i && lPrevSup == j) + { + uint offset = l+padStartJ + + (offsetStart + k+padStartI) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += deltaCur * kWeight * lWeight; + } + } + }} +} + +kernel void rotate2DForwardHalf( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * pDimensions, + constant float * pAngle, + constant float * pPadValue, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + float angle, padValue; + uint nbBatch; + + if (pNbChannels && pDimensions && pAngle && pPadValue && pNbBatch && + outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + angle = *pAngle; + padValue = *pPadValue; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + float centerI = float(height - 1) / 2.0; + float centerJ = float(width - 1) / 2.0; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float prevJ = + cos(-angle) * (float(j) - centerJ) + + sin(-angle) * (float(i) - centerI) + centerJ; + float prevI = + cos(-angle) * (float(i) - centerI) - + sin(-angle) * (float(j) - centerJ) + centerI; + + if (round(prevJ) < 0 || round(prevJ) >= float(width) || + round(prevI) < 0 || round(prevI) >= float(height)) + { + outs[offset] = padValue; + } + else + { + uint offsetPrev = round(prevJ) + (offsetStart + round(prevI)) * width; + outs[offset] = outsPrev[offsetPrev]; + } +} + +kernel void rotate2DBackwardHalf( + const device half * delta, + constant uint * pNbChannels, + constant 
uint * pDimensions, + constant float * pAngle, + constant uint * pNbBatch, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + float angle; + uint nbBatch; + + if (pNbChannels && pDimensions && pAngle && pNbBatch && + delta && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + angle = *pAngle; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + float centerI = float(height - 1) / 2.0; + float centerJ = float(width - 1) / 2.0; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetPrev = j + (offsetStart + i) * width; + + float rotJ = + cos(angle) * (float(j) - centerJ) + + sin(angle) * (float(i) - centerI) + centerJ; + float rotI = + cos(angle) * (float(i) - centerI) - + sin(angle) * (float(j) - centerJ) + centerI; + + for (int k = floor(rotI); k <= ceil(rotI); k++) { + for (int l = floor(rotJ); l <= ceil(rotJ); l++) + { + float prevL = + cos(-angle) * (float(l) - centerJ) + + sin(-angle) * (float(k) - centerI) + centerJ; + float prevK = + cos(-angle) * (float(k) - centerI) - + sin(-angle) * (float(l) - centerJ) + centerI; + + if (round(prevL) == j && round(prevK) == i && + l >= 0 && l < (int)width && k >= 0 && k < (int)height) + { + uint offset = l + (offsetStart + k) * width; + deltaPrev[offsetPrev] += delta[offset]; + } + }} +} + +kernel void resizeBilinearCropForwardHalf( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimensions2Resize, + constant uint * pCropOffsets, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint height2Resize, width2Resize; + uint offsetI, offsetJ; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pDimensionsPrev && pDimensions2Resize && + pCropOffsets && pNbBatch && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + width2Resize = pDimensions2Resize[0]; + height2Resize = pDimensions2Resize[1]; + offsetJ = pCropOffsets[0]; + offsetI = pCropOffsets[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + float ratioInOutI = float(height2Resize - 1) / float(height - 1); + float ratioInOutJ = float(width2Resize - 1) / float(width - 1); + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float iPrev = i * ratioInOutI; + float jPrev = j * ratioInOutJ; + + uint iPrevInf = floor(iPrev); + uint iPrevSup = ceil(iPrev); + uint jPrevInf = floor(jPrev); + uint jPrevSup = ceil(jPrev); + + float iWeight = ratioInOutI * i - iPrevInf; + float jWeight = ratioInOutJ * j - jPrevInf; + + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev11 = jPrevInf+offsetJ + + (offsetStartPrev + iPrevInf+offsetI) * widthPrev; + uint offsetPrev12 = jPrevSup+offsetJ + + (offsetStartPrev + iPrevInf+offsetI) * widthPrev; + uint 
offsetPrev21 = jPrevInf+offsetJ + + (offsetStartPrev + iPrevSup+offsetI) * widthPrev; + uint offsetPrev22 = jPrevSup+offsetJ + + (offsetStartPrev + iPrevSup+offsetI) * widthPrev; + + float out = outsPrev[offsetPrev11] * (1.0 - iWeight) * (1.0 - jWeight); + out += outsPrev[offsetPrev12] * (1.0 - iWeight) * jWeight; + out += outsPrev[offsetPrev21] * iWeight * (1.0 - jWeight); + out += outsPrev[offsetPrev22] * iWeight * jWeight; + + outs[offset] = out; +} + +kernel void resizeBilinearCropBackwardHalf( + const device half * delta, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimensions2Resize, + constant uint * pCropOffsets, + constant uint * pNbBatch, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint height2Resize, width2Resize; + uint offsetI, offsetJ; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pDimensionsPrev && pDimensions2Resize && + pCropOffsets && pNbBatch && delta && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + width2Resize = pDimensions2Resize[0]; + height2Resize = pDimensions2Resize[1]; + offsetJ = pCropOffsets[0]; + offsetI = pCropOffsets[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depth >= widthPrev * nbChannels) + { + return ; + } + if (i < offsetI || i >= height2Resize + offsetI || + j < offsetJ || j >= width2Resize + offsetJ) + { + return ; + } + + float ratioInOutI = float(height2Resize - 1) / float(height - 1); + float ratioInOutJ = float(width2Resize - 1) / float(width - 1); + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + float I = i-offsetI; + float J = j-offsetJ; + + float kLow = (I-1.0) / ratioInOutI; + float kHigh = (I+1.0) / ratioInOutI; + float lLow = (J-1.0) / ratioInOutJ; + float lHigh = (J+1.0) / ratioInOutJ; + + int kStart = ceil(kLow); + int kEnd = floor(kHigh); + int lStart = ceil(lLow); + int lEnd = floor(lHigh); + + for (int k = kStart; k <= kEnd; k++) { + for (int l = lStart; l <= lEnd; l++) + { + if (k >= 0 && k < (int)height && + l >= 0 && l < (int)width) + { + float kPrev = k * ratioInOutI; + float lPrev = l * ratioInOutJ; + + uint kPrevInf = floor(kPrev); + uint kPrevSup = ceil(kPrev); + uint lPrevInf = floor(lPrev); + uint lPrevSup = ceil(lPrev); + + float kWeight = ratioInOutI * k - kPrevInf; + float lWeight = ratioInOutJ * l - lPrevInf; + + if (kPrevInf == I && lPrevInf == J) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += + deltaCur * (1.0 - kWeight) * (1.0 - lWeight); + } + else if (kPrevInf == I && lPrevSup == J) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += deltaCur * (1.0 - kWeight) * lWeight; + } + else if (kPrevSup == I && lPrevInf == J) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += deltaCur * kWeight * (1.0 - lWeight); + } + else if (kPrevSup == I && lPrevSup == J) + { + uint offset = l + (offsetStart + k) * width; + float 
deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += deltaCur * kWeight * lWeight; + } + } + }} +} + +kernel void concat02DForwardHalf( + const device half * outsPrev, + constant uint * pGlobalOffset, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + uint globalOffset; + + if (pGlobalOffset && pNbChannels && pDimensions && + pNbBatch && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + globalOffset = *pGlobalOffset; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * height; + uint offsetStart = (depth + nbChannels * (globalOffset+elem)) * height; + + uint offsetPrev = j + (offsetStartPrev + i) * width; + uint offset = j + (offsetStart + i) * width; + + outs[offset] = outsPrev[offsetPrev]; +} + +kernel void concat02DBackwardHalf( + const device half * delta, + constant uint * pGlobalOffset, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + uint globalOffset; + uint dirty; + + if (pGlobalOffset && pNbChannels && pDimensions && + pNbBatch && pDirty && delta && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + globalOffset = *pGlobalOffset; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * height; + uint offsetStart = (depth + nbChannels * (globalOffset+elem)) * height; + + uint offsetPrev = j + (offsetStartPrev + i) * width; + uint offset = j + (offsetStart + i) * width; + + if (dirty) + { + deltaPrev[offsetPrev] = delta[offset]; + } + else + { + deltaPrev[offsetPrev] += delta[offset]; + } +} + +kernel void concat12DForwardHalf( + const device half * outsPrev, + constant uint * pGlobalOffset, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbChannelsPrev; + uint nbBatch; + uint globalOffset; + + if (pGlobalOffset && pNbChannels && pNbChannelsPrev && pDimensions && + pNbBatch && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + globalOffset = *pGlobalOffset; + } + else + return ; + + uint depthPrev = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depthPrev >= width * nbChannelsPrev) + { + return ; + } + + uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * height; + uint offsetStart = (globalOffset+depthPrev + nbChannels * elem) * height; + + uint offsetPrev = j + (offsetStartPrev + i) * width; + 
uint offset = j + (offsetStart + i) * width;
+
+    outs[offset] = outsPrev[offsetPrev];
+}
+
+kernel void concat12DBackwardHalf(
+    const device half * delta,
+    constant uint * pGlobalOffset,
+    constant uint * pNbChannels,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbChannelsPrev;
+    uint nbBatch;
+    uint globalOffset;
+    uint dirty;
+
+    if (pGlobalOffset && pNbChannels && pNbChannelsPrev && pDimensions &&
+        pNbBatch && pDirty && delta && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbChannelsPrev = *pNbChannelsPrev;
+        nbBatch = *pNbBatch;
+        globalOffset = *pGlobalOffset;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depthPrev = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depthPrev >= width * nbChannelsPrev)
+    {
+        return ;
+    }
+
+    uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * height;
+    uint offsetStart = (globalOffset+depthPrev + nbChannels * elem) * height;
+
+    uint offsetPrev = j + (offsetStartPrev + i) * width;
+    uint offset = j + (offsetStart + i) * width;
+
+    if (dirty)
+    {
+        deltaPrev[offsetPrev] = delta[offset];
+    }
+    else
+    {
+        deltaPrev[offsetPrev] += delta[offset];
+    }
+}
+
+kernel void constant2DForwardHalf(
+    const device half * weights,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pNbBatch && weights && outs)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    outs[offset] = weights[depth];
+}
+
+kernel void MSE2DLossHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    device half * losses,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pNbBatch && outs && groundTruth && losses)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbChannels; depth++) {
+    for (uint i=0; i<height; i++){
+    for (uint j=0; j<width; j++)
+    {
+        uint offsetStart = (depth + nbChannels * elem) * height;
+        uint offset = j + (offsetStart + i) * width;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float diff = out - gt;
+
+        tmp += diff * diff;
+    }}
+    }
+
+    losses[elem] = tmp;
+}
+
+kernel void MSE2DLossDerivativeHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbChannels && pDimensions && pCoeff && pNbBatch && pDirty &&
+        outs && groundTruth && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    float gt = groundTruth[offset];
+    float out = outs[offset];
+    float diff = out - gt;
+
+    if (dirty)
+    {
+        deltaPrev[offset] = 2 * coeff * diff /
+            float(nbBatch * nbChannels * height * width);
+    }
+    else
+    {
+        deltaPrev[offset] += 2 * coeff * diff /
+            float(nbBatch * nbChannels * height * width);
+    }
+}
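A minimal CPU reference for the pair of kernels above (plain C++; the function name and flat buffer layout are illustrative assumptions, not part of the patch): the derivative kernel computes the gradient of coeff times the mean squared error, i.e. 2 * coeff * (out - gt) / N with N = nbBatch * nbChannels * height * width.

    #include <cstddef>

    // Gradient of coeff * mean((out - gt)^2) over a flat buffer of N values,
    // matching the dirty == 1 branch of MSE2DLossDerivativeHalf above.
    void mse2DGradientRef(
        const float * outs, const float * groundTruth, float * grad,
        float coeff, size_t nbElems)
    {
        for (size_t k = 0; k < nbElems; k++)
        {
            grad[k] = 2.0f * coeff * (outs[k] - groundTruth[k]) / float(nbElems);
        }
    }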
+kernel void selfCorrelate2DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint heightPrev, widthPrev;
+    uint nbChannelsPrev;
+    uint nbBatch;
+
+    if (pNbChannelsPrev && pDimensionsPrev && pNbBatch &&
+        outsPrev && outs)
+    {
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbChannelsPrev = *pNbChannelsPrev;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint channel1 = id[0] / nbChannelsPrev;
+    uint channel2 = id[0] % nbChannelsPrev;
+    uint elem = id[1];
+
+    if (channel1 * channel2 >= nbChannelsPrev * nbChannelsPrev ||
+        elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetStart1 = (channel1 + nbChannelsPrev * elem) * heightPrev;
+    uint offsetStart2 = (channel2 + nbChannelsPrev * elem) * heightPrev;
+
+    float correlation = 0.0;
+    for (uint i=0; i<heightPrev; i++){
+    for (uint j=0; j<widthPrev; j++)
+    {
+        uint offset1 = j + (offsetStart1 + i) * widthPrev;
+        uint offset2 = j + (offsetStart2 + i) * widthPrev;
+
+        correlation += outsPrev[offset1] * outsPrev[offset2];
+    }}
+
+    uint offset = channel2 + nbChannelsPrev * channel1 +
+        nbChannelsPrev * nbChannelsPrev * elem;
+    outs[offset] = correlation;
+}
+
+kernel void selfCorrelate2DBackwardHalf(
+    const device half * delta,
+    const device half * outsPrev,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint heightPrev, widthPrev;
+    uint nbChannelsPrev;
+    uint nbBatch;
+
+    if (pNbChannelsPrev && pDimensionsPrev && pNbBatch &&
+        delta && outsPrev && deltaPrev)
+    {
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbChannelsPrev = *pNbChannelsPrev;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depthPrev = id[0] / widthPrev;
+    uint elem = id[1] / heightPrev;
+    uint i = id[1] % heightPrev;
+    uint j = id[0] % widthPrev;
+
+    if (i * elem >= heightPrev * nbBatch ||
+        j * depthPrev >= widthPrev * nbChannelsPrev)
+    {
+        return ;
+    }
+
+    float correlation = 0.0;
+    for (uint col=0; col<nbChannelsPrev; col++)
+    {
+        uint offset1 = col + nbChannelsPrev * depthPrev +
+            nbChannelsPrev * nbChannelsPrev * elem;
+        uint offset2 = depthPrev + nbChannelsPrev * col +
+            nbChannelsPrev * nbChannelsPrev * elem;
+
+        uint offsetStart1 = (col + nbChannelsPrev * elem) * heightPrev;
+        uint offsetPrev1 = j + (offsetStart1 + i) * widthPrev;
+
+        correlation += outsPrev[offsetPrev1] * (delta[offset1] + delta[offset2]);
+    }
+
+    uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * heightPrev;
+    uint offsetPrev = j + (offsetStartPrev + i) * widthPrev;
+
+    deltaPrev[offsetPrev] += correlation;
+}
+
+kernel void normalize12DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pNbBatch && outsPrev && outs)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    float norm = 0.0;
+    for (uint depth1=0; depth1<nbChannels; depth1++)
+    {
+        uint offsetStart1 = (depth1 + nbChannels * elem) * height;
+        uint offset1 = j + (offsetStart1 + i) * width;
+
+        float outPrev = outsPrev[offset1];
+        norm += outPrev * outPrev;
+    }
+    norm = sqrt(norm);
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    outs[offset] = outsPrev[offset] / max(norm, 1e-12);
+}
+
+kernel void normalize12DBackwardHalf(
+    const device half * delta,
+    const device half * outsPrev,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbChannels && pDimensions && pNbBatch && pDirty &&
+        delta && outsPrev && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    float normTmp = 0.0;
+    for (uint depth1=0; depth1<nbChannels; depth1++)
+    {
+        uint offsetStart1 = (depth1 + nbChannels * elem) * height;
+        uint offset1 = j + (offsetStart1 + i) * width;
+
+        float outPrev1 = outsPrev[offset1];
+        normTmp += outPrev1 * outPrev1;
+    }
+    float norm = sqrt(normTmp);
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    float deltaCur = delta[offset];
+    float outPrev = outsPrev[offset];
+
+    float newValue = 0.0;
+    if (norm > 1e-12)
+    {
+        for (uint depth1=0; depth1<nbChannels; depth1++)
+        {
+            uint offsetStart1 = (depth1 + nbChannels * elem) * height;
+            uint offset1 = j + (offsetStart1 + i) * width;
+
+            newValue += delta[offset1] * outsPrev[offset1];
+        }
+        newValue = deltaCur / norm - newValue * outPrev / (norm * normTmp);
+    }
+    else
+    {
+        newValue = deltaCur / 1e-12;
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = newValue;
+    }
+    else
+    {
+        deltaPrev[offset] += newValue;
+    }
+}
+
+kernel void computeSquaredNorm122DHalf(
+    const device half * outsPrev,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbThreadgroups,
+    constant uint * pNbBatch,
+    device half * squaredNorms,
+    uint2 groupId [[ threadgroup_position_in_grid ]],
+    uint2 threadId [[ thread_position_in_threadgroup ]],
+    uint2 id [[ thread_position_in_grid ]])
+{
+    constexpr uint threadsPerThreadgroup = 64;
+    threadgroup float normShared[threadsPerThreadgroup];
+
+    uint height, width;
+    uint nbChannels;
+    uint nbThreadgroups;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pNbThreadgroups && pNbBatch &&
+        outsPrev && squaredNorms)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbThreadgroups = *pNbThreadgroups;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint remains = id[0];
+    uint depth = remains / (height * width);
+    remains = remains % (height * width);
+    uint i = remains / width;
+    uint j = remains % width;
+
+    if (depth * i * j >= nbChannels * height * width ||
+        elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    float outPrev = outsPrev[offset];
+    normShared[threadId[0]] = outPrev * outPrev;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    for (uint stride=threadsPerThreadgroup/2; stride>0; stride>>=1)
+    {
+        uint index = threadId[0] + groupId[0] * threadsPerThreadgroup;
+        if (threadId[0] < stride &&
+            (index + stride) < nbChannels * height * width)
+        {
+            normShared[threadId[0]] += normShared[threadId[0] + stride];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    if (threadId[0] == 0)
+    {
+        uint offset = elem * nbThreadgroups + groupId[0];
+        squaredNorms[offset] = normShared[0];
+    }
+}
+
+kernel void normalize122DForwardHalf(
+    const device half * outsPrev,
+    const device half * squaredNorms,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbThreadgroups,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbThreadgroups;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pNbThreadgroups && pNbBatch &&
+        outsPrev && squaredNorms && outs)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbThreadgroups = *pNbThreadgroups;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    float norm = sqrt(squaredNorms[elem]);
+    float outPrev = outsPrev[offset];
+
+    outs[offset] = outPrev / max(norm, 1e-12);
+}
+
+kernel void computeDeltaTmp122DHalf(
+    const device half * delta,
+    const device half * outsPrev,
+    const device half * squaredNorms,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbThreadgroups,
+    constant uint * pNbBatch,
+    device half * deltaTmp,
+    uint2 groupId [[ threadgroup_position_in_grid ]],
+    uint2 threadId [[ thread_position_in_threadgroup ]],
+    uint2 id [[
thread_position_in_grid ]]) +{ + constexpr uint threadsPerThreadgroup = 64; + threadgroup float deltaShared[threadsPerThreadgroup]; + + uint height, width; + uint nbChannels; + uint nbThreadgroups; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbThreadgroups && pNbBatch && + delta && outsPrev && squaredNorms && deltaTmp) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbThreadgroups = *pNbThreadgroups; + nbBatch = *pNbBatch; + } + else + return ; + + uint elem = id[1]; + uint remains = id[0]; + uint depth = remains / (height * width); + remains = remains % (height * width); + uint i = remains / width; + uint j = remains % width; + + if (depth * i * j >= nbChannels * height * width || + elem >= nbBatch) + { + return ; + } + + float norm = sqrt(squaredNorms[elem]); + if (norm > 1e-12) + { + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float deltaCur = delta[offset]; + float outPrev = outsPrev[offset]; + + deltaShared[threadId[0]] = outPrev * deltaCur; + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint stride=threadsPerThreadgroup/2; stride>0; stride>>=1) + { + uint index = threadId[0] + groupId[0] * threadsPerThreadgroup; + if (threadId[0] < stride && + (index + stride) < nbChannels * height * width) + { + deltaShared[threadId[0]] += deltaShared[threadId[0] + stride]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (threadId[0] == 0) + { + uint offset = elem * nbThreadgroups + groupId[0]; + deltaTmp[offset] = deltaShared[0]; + } + } +} + +kernel void normalize122DBackwardHalf( + const device half * delta, + const device half * outsPrev, + const device half * squaredNorms, + const device half * deltaTmp, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbThreadgroups, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbThreadgroups; + uint nbBatch; + uint dirty; + + if (pNbChannels && pDimensions && pNbThreadgroups && pNbBatch && pDirty && + delta && outsPrev && squaredNorms && deltaTmp && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbThreadgroups = *pNbThreadgroups; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + float norm = sqrt(squaredNorms[elem]); + float deltaCurTmp = deltaTmp[elem]; + float normTmp = pow(norm, 3); + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float outPrev = outsPrev[offset]; + float deltaCur = delta[offset]; + + float newValue = 0.0; + if (norm > 1e-12) + { + newValue = deltaCur / norm - deltaCurTmp * outPrev / normTmp; + } + else + { + newValue = deltaCur / 1e-12; + } + + if (dirty) + { + deltaPrev[offset] = newValue; + } + else + { + deltaPrev[offset] += newValue; + } +} + +kernel void similarBatchError2DLossHalf( + const device half * outs, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * losses, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbBatch && outs && losses) + { + 
width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem1 = id[0];
+    uint elem2 = id[1];
+
+    if (elem1 >= nbBatch || elem2 >= nbBatch)
+    {
+        return ;
+    }
+
+    if (elem1 == elem2)
+    {
+        losses[elem2 + nbBatch * elem1] = 0.0;
+    }
+    else
+    {
+        float sum = 0.0;
+        for (uint i=0; i<height; i++){
+        for (uint j=0; j<width; j++)
+        {
+            uint offsetStart1 = nbChannels * elem1 * height;
+            uint offset1 = j + (offsetStart1 + i) * width;
+
+            uint offsetStart2 = nbChannels * elem2 * height;
+            uint offset2 = j + (offsetStart2 + i) * width;
+
+            sum += outs[offset1] * outs[offset2];
+        }}
+
+        losses[elem2 + nbBatch * elem1] = sum;
+    }
+}
+
+kernel void similarBatchError2DLossDerivativeHalf(
+    const device half * outs,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbChannels && pDimensions && pCoeff && pNbBatch && pDirty &&
+        outs && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint i = id[0] / width;
+    uint j = id[0] % width;
+
+    if (i * j >= width * height || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float sum = 0.0;
+    for (uint elem1=0; elem1<nbBatch; elem1++) {
+    if (elem1 != elem)
+    {
+        uint offsetStart1 = nbChannels * elem1 * height;
+        uint offset1 = j + (offsetStart1 + i) * width;
+
+        sum += outs[offset1];
+    }}
+
+    uint offset = j + (nbChannels * elem * height + i) * width;
+
+    if (dirty)
+    {
+        deltaPrev[offset] = 2 * coeff * sum /
+            float(nbBatch * height * width);
+    }
+    else
+    {
+        deltaPrev[offset] += 2 * coeff * sum /
+            float(nbBatch * height * width);
+    }
+}
+
+kernel void similarError2DLossDerivativeHalf(
+    const device half * outs,
+    constant uint * pGlobalOffset,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pNbBatchPrev,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    float coeff;
+    uint globalOffset;
+    uint nbBatch;
+    uint nbBatchPrev;
+    uint dirty;
+
+    if (pGlobalOffset && pNbChannels && pDimensions && pCoeff &&
+        pNbBatch && pNbBatchPrev && pDirty && outs && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        coeff = *pCoeff;
+        globalOffset = *pGlobalOffset;
+        nbBatch = *pNbBatch;
+        nbBatchPrev = *pNbBatchPrev;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint i = id[0] / width;
+    uint j = id[0] % width;
+
+    if (i * j >= width * height || elem >= nbBatchPrev)
+    {
+        return ;
+    }
+
+    float sum = 0.0;
+    for (uint elem1=0; elem1<nbBatch; elem1++) {
+    if (elem1 != globalOffset + elem)
+    {
+        uint offsetStart1 = nbChannels * elem1 * height;
+        uint offset1 = j + (offsetStart1 + i) * width;
+
+        sum += outs[offset1];
+    }}
+
+    uint offsetPrev = j + (nbChannels * elem * height + i) * width;
+
+    if (dirty)
+    {
+        deltaPrev[offsetPrev] = 2 * coeff * sum /
+            float(nbBatch * height * width);
+    }
+    else
+    {
+        deltaPrev[offsetPrev] += 2 * coeff * sum /
+            float(nbBatch * height * width);
+    }
+}
+
+kernel void flipHorizontal2DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pDoFlip,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint doFlip;
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pDoFlip && pNbChannels && pDimensions && pNbBatch &&
+        outsPrev && outs)
+    {
+        doFlip = *pDoFlip;
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset1 = j + (offsetStart + i) * width;
+    uint offset2 = offset1;
+    if (doFlip)
+    {
+        offset2 = width-1-j + (offsetStart + i) * width;
+    }
+
+    outs[offset1] = outsPrev[offset2];
+}
+
+kernel void flipHorizontal2DBackwardHalf(
+    const device half * delta,
+    constant uint * pDoFlip,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint doFlip;
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+    uint dirty;
+
+    if (pDoFlip && pNbChannels && pDimensions && pNbBatch && pDirty &&
+        delta && deltaPrev)
+    {
+        doFlip = *pDoFlip;
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset1 = j + (offsetStart + i) * width;
+    uint offset2 = offset1;
+    if (doFlip)
+    {
+        offset2 = width-1-j + (offsetStart + i) * width;
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset1] = delta[offset2];
+    }
+    else
+    {
+        deltaPrev[offset1] += delta[offset2];
+    }
+}
+
+kernel void flipVertical2DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pDoFlip,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint doFlip;
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pDoFlip && pNbChannels && pDimensions && pNbBatch &&
+        outsPrev && outs)
+    {
+        doFlip = *pDoFlip;
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset1 = j + (offsetStart + i) * width;
+    uint offset2 = offset1;
+    if (doFlip)
+    {
+        offset2 = j + (offsetStart + height-1-i) * width;
+    }
+
+    outs[offset1] = outsPrev[offset2];
+}
+
+kernel void flipVertical2DBackwardHalf(
+    const device half * delta,
+    constant uint * pDoFlip,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint doFlip;
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+    uint dirty;
+
+    if (pDoFlip && pNbChannels && pDimensions && pNbBatch && pDirty &&
+        delta &&
deltaPrev) + { + doFlip = *pDoFlip; + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset1 = j + (offsetStart + i) * width; + uint offset2 = offset1; + if (doFlip) + { + offset2 = j + (offsetStart + height-1-i) * width; + } + + if (dirty) + { + deltaPrev[offset1] = delta[offset2]; + } + else + { + deltaPrev[offset1] += delta[offset2]; + } +} + +kernel void colorJitterHSVForwardHalf( + const device half * outsPrev, + constant float * pNoise, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + float noiseH, noiseS, noiseV; + uint height, width; + uint nbBatch; + + if (pNoise && pDimensions && pNbBatch && outsPrev && outs) + { + noiseH = pNoise[0]; + noiseS = pNoise[1]; + noiseV = pNoise[2]; + width = pDimensions[0]; + height = pDimensions[1]; + nbBatch = *pNbBatch; + } + else + return ; + + uint elem = id[1]; + uint row = id[0] / width; + uint col = id[0] % width; + + if (row * col >= height * width || + elem >= nbBatch) + { + return ; + } + + uint offsetStartR = (0 + 3 * elem) * height; + uint offsetStartG = (1 + 3 * elem) * height; + uint offsetStartB = (2 + 3 * elem) * height; + + uint offsetR = col + (offsetStartR + row) * width; + uint offsetG = col + (offsetStartG + row) * width; + uint offsetB = col + (offsetStartB + row) * width; + + float r = outsPrev[offsetR]; + float g = outsPrev[offsetG]; + float b = outsPrev[offsetB]; + + float maxValue = max(max(r, g), b); + float minValue = min(min(r, g), b); + float delta = maxValue - minValue; + + float h; + if (delta == 0) + { + h = 0.0; + } + else if (maxValue == r) + { + h = (g - b) / delta; + } + else if (maxValue == g) + { + h = (g - b) / delta + 2.0; + } + else + { + h = (g - b) / delta + 4.0; + } + h *= 60.0; + + float s = 0.0; + if (maxValue != 0) + { + s = delta / maxValue; + } + + float v = maxValue; + + h += noiseH; h = max(h, 0.0); h = min(h, 360.0); + s += noiseS; s = max(s, 0.0); s = min(s, 1.0); + v += noiseV; v = max(v, 0.0); v = min(v, 1.0); + + if (s == 0.0) + { + r = v; g = v; b = v; + } + + float angle = h; + float sector = angle / 60; // Sector + float i = floor(sector); + float f = sector - i; // Factorial part of h + + float p = v * (1 - s); + float q = v * (1 - (s * f)); + float t = v * (1 - (s * (1 - f))); + + if (i == 0) + { + r = v; g = t; b = p; + } + else if (i == 1) + { + r = q; g = v; b = p; + } + else if (i == 2) + { + r = p; g = v; b = t; + } + else if (i == 3) + { + r = p; g = q; b = v; + } + else if (i == 4) + { + r = t; g = p; b = v; + } + else + { + r = v; g = p; b = q; + } + + outs[offsetR] = r; + outs[offsetG] = g; + outs[offsetB] = b; +} + +kernel void BCE2DLossHalf( + const device half * outs, + const device half * groundTruth, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * losses, + uint id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbBatch && outs && groundTruth && losses) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint 
elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbChannels; depth++) {
+    for (uint i=0; i<height; i++){
+    for (uint j=0; j<width; j++)
+    {
+        uint offsetStart = (depth + nbChannels * elem) * height;
+        uint offset = j + (offsetStart + i) * width;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float tmp1 = log(out);
+        float tmp2 = log(1 - out);
+
+        tmp -= (gt * tmp1 + (1 - gt) * tmp2);
+    }}
+    }
+
+    losses[elem] = tmp;
+}
+
+kernel void BCE2DLossDerivativeHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbChannels && pDimensions && pCoeff && pNbBatch && pDirty &&
+        outs && groundTruth && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    float gt = groundTruth[offset];
+    float out = outs[offset];
+    float derivative = 0.0;
+
+    if (gt == 1.0)
+    {
+        derivative = -1 / out;
+    }
+    else if (gt == 0.0)
+    {
+        derivative = 1 / (1 - out);
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = coeff * derivative /
+            float(nbBatch * nbChannels * height * width);
+    }
+    else
+    {
+        deltaPrev[offset] += coeff * derivative /
+            float(nbBatch * nbChannels * height * width);
+    }
+}
+
+kernel void BCESigmoid2DLossHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    device half * losses,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pNbBatch && outs && groundTruth && losses)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbChannels; depth++) {
+    for (uint i=0; i<height; i++){
+    for (uint j=0; j<width; j++)
+    {
+        uint offsetStart = (depth + nbChannels * elem) * height;
+        uint offset = j + (offsetStart + i) * width;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float value;
+
+        if (out > 0)
+        {
+            value = (1 - gt) * out;
+            value += log(1 + exp(-out));
+        }
+        else
+        {
+            value = -out * gt;
+            value += log(exp(out) + 1);
+        }
+
+        tmp += value;
+    }}
+    }
+
+    losses[elem] = tmp;
+}
+
+kernel void BCESigmoid2DLossDerivativeHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbChannels && pDimensions && pNbBatch && pCoeff && pDirty &&
+        outs && groundTruth && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    float gt = groundTruth[offset];
+    float out = outs[offset];
+    float value;
+
+    if (out >= 0)
+    {
+        value = 1.0 / (1.0 + exp(-out));
+    }
+    else
+    {
+        value = exp(out) / (1.0 + exp(out));
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = coeff * (value - gt) /
+            float(nbBatch * nbChannels * height * width);
+    }
+    else
+    {
+        deltaPrev[offset] += coeff * (value - gt) /
+            float(nbBatch * nbChannels * height * width);
+    }
+}
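The two branches in BCESigmoid2DLossHalf above are the usual overflow-safe form of sigmoid cross-entropy on logits. A compact CPU reference (plain C++; function names are illustrative, not part of the patch):

    #include <cmath>

    // Stable sigmoid cross-entropy on a logit: equals (1-gt)*out + log(1+exp(-out))
    // for out > 0 and -out*gt + log(1+exp(out)) otherwise, i.e. the kernel's branches.
    float bceWithLogitsRef(float out, float gt)
    {
        return fmaxf(out, 0.0f) - out * gt + log1pf(expf(-fabsf(out)));
    }

    // Its derivative is sigmoid(out) - gt, with the same overflow guard as
    // BCESigmoid2DLossDerivativeHalf above.
    float bceWithLogitsGradRef(float out, float gt)
    {
        float sig = out >= 0.0f ? 1.0f / (1.0f + expf(-out))
                                : expf(out) / (1.0f + expf(out));
        return sig - gt;
    }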
+kernel void layerCAM2DForwardHalf(
+    const device half * outsPrev,
+    const device half * deltaPrev,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensions,
+    constant uint * pKeepPositive,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbBatch;
+    uint nbChannelsPrev;
+    uint keepPositive;
+
+    if (pNbChannelsPrev && pDimensions && pKeepPositive && pNbBatch &&
+        outsPrev && outs)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannelsPrev = *pNbChannelsPrev;
+        keepPositive = *pKeepPositive;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint i = id[0] / width;
+    uint j = id[0] % width;
+
+    if (i * j >= height * width || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float sum = 0.0;
+    for (uint depthPrev=0; depthPrev<nbChannelsPrev; depthPrev++)
+    {
+        uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * height;
+        uint offsetPrev = j + (offsetStartPrev + i) * width;
+
+        float deltaPrevTmp = deltaPrev[offsetPrev];
+        if (!keepPositive)
+        {
+            deltaPrevTmp = -deltaPrevTmp;
+        }
+        if (deltaPrevTmp < 0)
+        {
+            deltaPrevTmp = 0.0;
+        }
+
+        sum += deltaPrevTmp * outsPrev[offsetPrev];
+    }
+
+    uint offset = j + (elem * height + i) * width;
+    outs[offset] = sum;
+}
diff --git a/Sources/GrAIdient/Metal/Kernel/LayerMerge.metal b/Sources/GrAIdient/Metal/Kernel/LayerMergeFloat.metal
rename from Sources/GrAIdient/Metal/Kernel/LayerMerge.metal
rename to Sources/GrAIdient/Metal/Kernel/LayerMergeFloat.metal
--- a/Sources/GrAIdient/Metal/Kernel/LayerMerge.metal
+++ b/Sources/GrAIdient/Metal/Kernel/LayerMergeFloat.metal
@@ -8,7 +8,7 @@
 #include <metal_stdlib>
 using namespace metal;
 
-kernel void sum1(
+kernel void sum1Float(
     const device float * ins,
     constant uint * pNbElems,
     device float * outs,
@@ -31,7 +31,7 @@ kernel void sum1(
     outs[id] = ins[id];
 }
 
-kernel void sum14(
+kernel void sum14Float(
     const device float4 * ins,
     constant uint * pNbElems,
     device float4 * outs,
@@ -54,7 +54,7 @@ kernel void sum14(
     outs[id] = ins[id];
 }
 
-kernel void sum2(
+kernel void sum2Float(
     const device float * ins,
     constant uint * pNbElems,
     device float * outs,
@@ -77,7 +77,7 @@ kernel void sum2(
     outs[id] += ins[id];
 }
 
-kernel void sum24(
+kernel void sum24Float(
     const device float4 * ins,
     constant uint * pNbElems,
     device float4 * outs,
@@ -100,7 +100,7 @@ kernel void sum24(
     outs[id] += ins[id];
 }
 
-kernel void multiplyForward(
+kernel void multiplyForwardFloat(
     const device float * outsPrev,
     constant uint * pNbElems,
     device float * outs,
@@ -123,7 +123,7 @@ kernel void multiplyForward(
     outs[id] *= outsPrev[id];
 }
 
-kernel void multiplyBackward(
+kernel void multiplyBackwardFloat(
     const device float * outs,
     const device float * delta,
     constant uint * pNbElems,
diff --git a/Sources/GrAIdient/Metal/Kernel/LayerMergeHalf.metal b/Sources/GrAIdient/Metal/Kernel/LayerMergeHalf.metal
new file mode 100644
index 00000000..d3ca0403
--- /dev/null
+++ b/Sources/GrAIdient/Metal/Kernel/LayerMergeHalf.metal
@@ -0,0 +1,161 @@
+//
+// LayerMerge.metal
+// GrAIdient
+//
+// Created by Jean-François Reboud on 14/10/2022.
+//
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void sum1Half(
+    const device half * ins,
+    constant uint * pNbElems,
+    device half * outs,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+
+    if (pNbElems && ins && outs)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+
+    if (id >= nbElems)
+    {
+        return ;
+    }
+
+    outs[id] = ins[id];
+}
+
+kernel void sum14Half(
+    const device half4 * ins,
+    constant uint * pNbElems,
+    device half4 * outs,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+
+    if (pNbElems && ins && outs)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+
+    if (id * 4 >= nbElems)
+    {
+        return ;
+    }
+
+    outs[id] = ins[id];
+}
+
+kernel void sum2Half(
+    const device half * ins,
+    constant uint * pNbElems,
+    device half * outs,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+
+    if (pNbElems && ins && outs)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+
+    if (id >= nbElems)
+    {
+        return ;
+    }
+
+    outs[id] += ins[id];
+}
+
+kernel void sum24Half(
+    const device half4 * ins,
+    constant uint * pNbElems,
+    device half4 * outs,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+
+    if (pNbElems && ins && outs)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+
+    if (id * 4 >= nbElems)
+    {
+        return ;
+    }
+
+    outs[id] += ins[id];
+}
+
+kernel void multiplyForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbElems,
+    device half * outs,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+
+    if (pNbElems && outsPrev && outs)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+
+    if (id >= nbElems)
+    {
+        return ;
+    }
+
+    outs[id] *= outsPrev[id];
+}
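Elementwise-multiply backward is the product rule: for out = a * b, d(loss)/da = delta * b. A CPU sketch (plain C++; it assumes, as the host code appears to arrange, that the outs argument of the backward kernel holds the product of the other operands; it also illustrates the dirty convention used throughout the patch, overwrite a stale gradient versus accumulate):

    // Product-rule backward for one operand of an elementwise multiply.
    void multiplyBackwardRef(
        const float * otherProduct, const float * delta, float * deltaPrev,
        bool dirty, int nbElems)
    {
        for (int k = 0; k < nbElems; k++)
        {
            float g = delta[k] * otherProduct[k];
            if (dirty) { deltaPrev[k] = g; }   // first writer: overwrite
            else       { deltaPrev[k] += g; }  // later writers: accumulate
        }
    }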
+kernel void multiplyBackwardHalf(
+    const device half * outs,
+    const device half * delta,
+    constant uint * pNbElems,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+    uint dirty;
+
+    if (pNbElems && pDirty && outs && delta && deltaPrev)
+    {
+        nbElems = pNbElems[0];
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    if (id >= nbElems)
+    {
+        return ;
+    }
+
+    float tmp = outs[id];
+    float deltaCur = delta[id];
+
+    if (dirty)
+    {
+        deltaPrev[id] = deltaCur * tmp;
+    }
+    else
+    {
+        deltaPrev[id] += deltaCur * tmp;
+    }
+}
diff --git a/Sources/GrAIdient/Metal/Kernel/LayerNorm.metal b/Sources/GrAIdient/Metal/Kernel/LayerNormFloat.metal
similarity index 96%
rename from Sources/GrAIdient/Metal/Kernel/LayerNorm.metal
rename to Sources/GrAIdient/Metal/Kernel/LayerNormFloat.metal
index 7049fea2..51a25688 100644
--- a/Sources/GrAIdient/Metal/Kernel/LayerNorm.metal
+++ b/Sources/GrAIdient/Metal/Kernel/LayerNormFloat.metal
@@ -8,7 +8,7 @@
 #include <metal_stdlib>
 using namespace metal;
 
-kernel void computeLayerNormSeqμ(
+kernel void computeLayerNormSeqμFloat(
     const device float * tmps,
     constant uint * pNbNeurons,
     constant uint * pNbBatch,
@@ -48,7 +48,7 @@ kernel void computeLayerNormSeqμ(
     μ[seq + sequence * elem] = sum / nbElems;
 }
 
-kernel void computeLayerNormSeqμ4(
+kernel void computeLayerNormSeqμ4Float(
     const device float4 * tmps,
     constant uint * pNbNeurons,
     constant uint * pNbBatch,
@@ -89,7 +89,7 @@ kernel void computeLayerNormSeqμ4(
     μ[seq + sequence * elem] = (sum[0] + sum[1] + sum[2] + sum[3]) / nbElems;
 }
 
-kernel void computeLayerNormSeqσ2(
+kernel void computeLayerNormSeqσ2Float(
     const device float * tmps,
     const device float * μ,
     constant uint * pNbNeurons,
@@ -132,7 +132,7 @@ kernel void computeLayerNormSeqσ2(
     σ2[seq + sequence * elem] = sum / nbElems;
 }
 
-kernel void computeLayerNormSeqσ24(
+kernel void computeLayerNormSeqσ24Float(
     const device float4 * tmps,
     const device float * μ,
     constant uint * pNbNeurons,
@@ -176,7 +176,7 @@ kernel void computeLayerNormSeqσ24(
     σ2[seq + sequence * elem] = (sum[0] + sum[1] + sum[2] + sum[3]) / nbElems;
 }
 
-kernel void forwardLayerNormSeq(
+kernel void forwardLayerNormSeqFloat(
     const device float * β,
     const device float * Ɣ,
     const device float * μ,
@@ -221,7 +221,7 @@ kernel void forwardLayerNormSeq(
     tmps[offset] = Ɣ[depth] * xhat + β[depth];
 }
 
-kernel void forwardLayerNormSeq4(
+kernel void forwardLayerNormSeq4Float(
     const device float4 * β,
     const device float4 * Ɣ,
     const device float * μ,
@@ -267,7 +267,7 @@ kernel void forwardLayerNormSeq4(
     tmps[offset] = Ɣ[depth] * xhat + β[depth];
 }
 
-kernel void backwardWeights1LayerNormSeq(
+kernel void backwardWeights1LayerNormSeqFloat(
     const device float * delta,
     const device float * xHat,
     const device float * Ɣ,
@@ -316,7 +316,7 @@ kernel void backwardWeights1LayerNormSeq(
     sum2[seq + sequence * elem] = tmp2;
 }
 
-kernel void backwardWeights1LayerNormSeq4(
+kernel void backwardWeights1LayerNormSeq4Float(
     const device float4 * delta,
     const device float4 * xHat,
     const device float4 * Ɣ,
@@ -365,7 +365,7 @@ kernel void backwardWeights1LayerNormSeq4(
     sum2[seq + sequence * elem] = tmp2[0] + tmp2[1] + tmp2[2] + tmp2[3];
 }
 
-kernel void backwardWeights2LayerNormSeq(
+kernel void backwardWeights2LayerNormSeqFloat(
     const device float * delta,
     const device float * xHat,
     constant uint * pNbNeurons,
@@ -424,7 +424,7 @@
     }
 }
 
-kernel void backwardWeights2LayerNormSeq4(
+kernel void backwardWeights2LayerNormSeq4Float(
     const device float4 * delta,
     const device float4 * xHat,
     constant uint * pNbNeurons,
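The backwardLayerNormSeq kernels renamed below implement the standard LayerNorm input gradient: with xHat = (x - μ)/σ, dxHat = Ɣ * delta and N values per (batch, sequence) slot, dx = (N * dxHat - Σ dxHat - xHat * Σ(dxHat * xHat)) / (N * σ), where the two sums are the sum1/sum2 buffers reduced by the backwardWeights1 kernels. A CPU reference for one slot (plain C++; names are illustrative, not part of the library):

    #include <cmath>

    // LayerNorm input gradient for one (batch, sequence) slot of N neurons,
    // overwriting delta in place as the kernels do.
    void layerNormBackwardRef(
        const float * xHat, const float * gamma, float * delta,
        float sigma2, int N)
    {
        float sum1 = 0.0f, sum2 = 0.0f;
        for (int d = 0; d < N; d++)
        {
            float dxHat = gamma[d] * delta[d];
            sum1 += dxHat;
            sum2 += dxHat * xHat[d];
        }
        float mult = 1.0f / (float(N) * sqrtf(sigma2 + 1e-5f));
        for (int d = 0; d < N; d++)
        {
            float dxHat = gamma[d] * delta[d];
            delta[d] = mult * (float(N) * dxHat - sum1 - xHat[d] * sum2);
        }
    }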
@@ -483,7 +483,7 @@
     }
 }
 
-kernel void backwardLayerNormSeq(
+kernel void backwardLayerNormSeqFloat(
     const device float * σ2,
     const device float * xHat,
     const device float * Ɣ,
@@ -532,7 +532,7 @@
     delta[offset] = mult * (tmp1 - tmp2 - tmp3);
 }
 
-kernel void backwardLayerNormSeq4(
+kernel void backwardLayerNormSeq4Float(
     const device float * σ2,
     const device float4 * xHat,
     const device float4 * Ɣ,
diff --git a/Sources/GrAIdient/Metal/Kernel/LayerNormHalf.metal b/Sources/GrAIdient/Metal/Kernel/LayerNormHalf.metal
new file mode 100644
index 00000000..cfecfa0f
--- /dev/null
+++ b/Sources/GrAIdient/Metal/Kernel/LayerNormHalf.metal
@@ -0,0 +1,583 @@
+//
+// LayerNorm.metal
+// GrAIdient
+//
+// Created by Jean-François Reboud on 09/03/2023.
+//
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void computeLayerNormSeqμHalf(
+    const device half * tmps,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * μ,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence && tmps && μ)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint seq = id[0];
+    if (elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint nbElems = nbNeurons;
+    float sum = 0.0;
+
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+        sum += tmps[offset];
+    }
+    μ[seq + sequence * elem] = sum / nbElems;
+}
+
+kernel void computeLayerNormSeqμ4Half(
+    const device half4 * tmps,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * μ,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence && tmps && μ)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint seq = id[0];
+    if (elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint nbElems = nbNeurons;
+    half4 sum = 0.0;
+
+    for (uint depth=0; depth<nbNeurons/4; depth++)
+    {
+        uint offset =
+            (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4;
+        sum += tmps[offset];
+    }
+    μ[seq + sequence * elem] = (sum[0] + sum[1] + sum[2] + sum[3]) / nbElems;
+}
+
+kernel void computeLayerNormSeqσ2Half(
+    const device half * tmps,
+    const device half * μ,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * σ2,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence && tmps && μ && σ2)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint seq = id[0];
+    if (elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint nbElems = nbNeurons;
+    float sum = 0.0;
+
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+        float tmp = tmps[offset] - μ[seq + sequence * elem];
+        sum += tmp * tmp;
+    }
+    σ2[seq + sequence * elem] = sum / nbElems;
+}
+
+kernel void computeLayerNormSeqσ24Half(
+    const device half4 * tmps,
+    const device half * μ,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * σ2,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence && tmps && μ && σ2)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint seq = id[0];
+    if (elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint nbElems = nbNeurons;
+    half4 sum = 0.0;
+
+    for (uint depth=0; depth<nbNeurons/4; depth++)
+    {
+        uint offset =
+            (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4;
+        half4 tmp = tmps[offset] - μ[seq + sequence * elem];
+        sum += tmp * tmp;
+    }
+    σ2[seq + sequence * elem] = (sum[0] + sum[1] + sum[2] + sum[3]) / nbElems;
+}
+
+kernel void forwardLayerNormSeqHalf(
+    const device half * β,
+    const device half * Ɣ,
+    const device half * μ,
+    const device half * σ2,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * tmps,
+    device half * xHat,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    float Ɛ = 1e-5;
+
+    if (pNbNeurons && pNbBatch && pSequence && β && Ɣ &&
+        tmps && xHat && μ && σ2)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1] / sequence;
+    uint seq = id[1] % sequence;
+
+    if (depth >= nbNeurons || elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+
+    float tmp1 = tmps[offset] - μ[seq + sequence * elem];
+    float tmp2 = sqrt(σ2[seq + sequence * elem] + Ɛ);
+    float xhat = tmp1 / tmp2;
+    xHat[offset] = xhat;
+    tmps[offset] = Ɣ[depth] * xhat + β[depth];
+}
+
+kernel void forwardLayerNormSeq4Half(
+    const device half4 * β,
+    const device half4 * Ɣ,
+    const device half * μ,
+    const device half * σ2,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half4 * tmps,
+    device half4 * xHat,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    float Ɛ = 1e-5;
+
+    if (pNbNeurons && pNbBatch && pSequence && β && Ɣ &&
+        tmps && xHat && μ && σ2)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1] / sequence;
+    uint seq = id[1] % sequence;
+
+    if (depth * 4 >= nbNeurons || elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint offset =
+        (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4;
+
+    half4 tmp1 = tmps[offset] - μ[seq + sequence * elem];
+    float tmp2 = sqrt(σ2[seq + sequence * elem] + Ɛ);
+    half4 xhat = tmp1 / tmp2;
+    xHat[offset] = xhat;
+    tmps[offset] = Ɣ[depth] * xhat + β[depth];
+}
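A recurring pattern in the half kernels above: reductions accumulate in float even though inputs and outputs are stored as half, since summing many half values directly loses precision once the running sum grows. A CPU sketch (plain C++; __fp16 is a clang extension standing in for Metal's half, an assumption for illustration):

    #include <cstddef>

    // Sum a buffer of half-precision values with a float accumulator,
    // mirroring `float sum = 0.0;` over `device half *` in the kernels above.
    float sumHalfBufferRef(const __fp16 * values, size_t n)
    {
        float sum = 0.0f;
        for (size_t k = 0; k < n; k++)
        {
            sum += float(values[k]);  // widen each half before adding
        }
        return sum;
    }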
+kernel void backwardWeights1LayerNormSeqHalf(
+    const device half * delta,
+    const device half * xHat,
+    const device half * Ɣ,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * sum1,
+    device half * sum2,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence &&
+        delta && xHat && Ɣ && sum1 && sum2)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint seq = id[0];
+    if (elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    float tmp1 = 0.0, tmp2 = 0.0;
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+
+        float deltaTmp = delta[offset];
+        float xHatTmp = xHat[offset];
+        float dxHat = Ɣ[depth] * deltaTmp;
+
+        tmp1 += dxHat;
+        tmp2 += dxHat * xHatTmp;
+    }
+    sum1[seq + sequence * elem] = tmp1;
+    sum2[seq + sequence * elem] = tmp2;
+}
+
+kernel void backwardWeights1LayerNormSeq4Half(
+    const device half4 * delta,
+    const device half4 * xHat,
+    const device half4 * Ɣ,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * sum1,
+    device half * sum2,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence &&
+        delta && xHat && Ɣ && sum1 && sum2)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint seq = id[0];
+    if (elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    half4 tmp1 = 0.0, tmp2 = 0.0;
+    for (uint depth=0; depth<nbNeurons/4; depth++)
+    {
+        uint offset =
+            (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4;
+
+        half4 deltaTmp = delta[offset];
+        half4 xHatTmp = xHat[offset];
+        half4 dxHat = Ɣ[depth] * deltaTmp;
+
+        tmp1 += dxHat;
+        tmp2 += dxHat * xHatTmp;
+    }
+    sum1[seq + sequence * elem] = tmp1[0] + tmp1[1] + tmp1[2] + tmp1[3];
+    sum2[seq + sequence * elem] = tmp2[0] + tmp2[1] + tmp2[2] + tmp2[3];
+}
+
+kernel void backwardWeights2LayerNormSeqHalf(
+    const device half * delta,
+    const device half * xHat,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    constant uint * pAccumulate,
+    device half * dƔ,
+    device half * dβ,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    uint accumulate;
+
+    if (pNbNeurons && pNbBatch && pSequence && pAccumulate &&
+        delta && xHat && dƔ && dβ)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+        accumulate = *pAccumulate;
+    }
+    else
+        return ;
+
+    uint depth = id;
+    if (depth >= nbNeurons)
+    {
+        return ;
+    }
+
+    float tmp1 = 0.0, tmp2 = 0.0;
+    for (uint elem=0; elem<nbBatch; elem++) {
+    for (uint seq=0; seq<sequence; seq++)
+    {
+        uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+
+        float deltaTmp = delta[offset];
+        float xHatTmp = xHat[offset];
+
+        tmp1 += deltaTmp;
+        tmp2 += deltaTmp * xHatTmp;
+    }}
+
+    if (accumulate)
+    {
+        dβ[depth] += tmp1;
+        dƔ[depth] += tmp2;
+    }
+    else
+    {
+        dβ[depth] = tmp1;
+        dƔ[depth] = tmp2;
+    }
+}
+
+kernel void backwardWeights2LayerNormSeq4Half(
+    const device half4 * delta,
+    const device half4 * xHat,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    constant uint * pAccumulate,
+    device half4 * dƔ,
+    device half4 * dβ,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    uint accumulate;
+
+    if (pNbNeurons && pNbBatch && pSequence && pAccumulate &&
+        delta && xHat && dƔ && dβ)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+        accumulate = *pAccumulate;
+    }
+    else
+        return ;
+
+    uint depth = id;
+    if (depth * 4 >= nbNeurons)
+    {
+        return ;
+    }
+
+    half4 tmp1 = 0.0, tmp2 = 0.0;
+    for (uint elem=0; elem<nbBatch; elem++) {
+    for (uint seq=0; seq<sequence; seq++)
+    {
+        uint offset =
+            (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4;
+
+        half4 deltaTmp = delta[offset];
+        half4 xHatTmp = xHat[offset];
+
+        tmp1 += deltaTmp;
+        tmp2 += deltaTmp * xHatTmp;
+    }}
+
+    if (accumulate)
+    {
+        dβ[depth] += tmp1;
+        dƔ[depth] += tmp2;
+    }
+    else
+    {
+        dβ[depth] = tmp1;
+        dƔ[depth] = tmp2;
+    }
+}
+
+kernel void backwardLayerNormSeqHalf(
+    const device half * σ2,
+    const device half * xHat,
+    const device half * Ɣ,
+    const device half * sum1,
+    const device half * sum2,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * delta,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    float Ɛ = 1e-5;
+
+    if (pNbNeurons && pNbBatch && pSequence &&
+        σ2 && xHat && Ɣ && sum1 && sum2 && delta)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1] / sequence;
+    uint seq = id[1] % sequence;
+    uint nbElems = nbNeurons;
+
+    if (depth >= nbNeurons || elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+
+    float mult =
+        1.0 / ((float)nbElems * sqrt(σ2[seq + sequence * elem] + Ɛ));
+    float dxHat = Ɣ[depth] * delta[offset];
+    float tmp1 = nbElems * dxHat;
+    float tmp2 = sum1[seq + sequence * elem];
+    float tmp3 = xHat[offset] * sum2[seq + sequence * elem];
+
+    delta[offset] = mult * (tmp1 - tmp2 - tmp3);
+}
+
+kernel void backwardLayerNormSeq4Half(
+    const device half * σ2,
+    const device half4 * xHat,
+    const device half4 * Ɣ,
+    const device half * sum1,
+    const device half * sum2,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half4 * delta,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    float Ɛ = 1e-5;
+
+    if (pNbNeurons && pNbBatch && pSequence &&
+        σ2 && xHat && Ɣ && sum1 && sum2 && delta)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1] / sequence;
+    uint seq = id[1] % sequence;
+    uint nbElems = nbNeurons;
+
+    if (depth * 4 >= nbNeurons || elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint offset =
+        (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4;
+
+    float mult =
+        1.0 / ((float)nbElems * sqrt(σ2[seq + sequence * elem] + Ɛ));
+    half4 dxHat = Ɣ[depth] * delta[offset];
+    half4 tmp1 = nbElems * dxHat;
+    float tmp2 = sum1[seq + sequence * elem];
+    half4 tmp3 = xHat[offset] * sum2[seq + sequence * elem];
+
+    delta[offset] = mult * (tmp1 - tmp2 - tmp3);
+}
diff --git a/Sources/GrAIdient/Metal/Kernel/LayerSeq.metal b/Sources/GrAIdient/Metal/Kernel/LayerSeqFloat.metal
similarity index 97%
rename from Sources/GrAIdient/Metal/Kernel/LayerSeq.metal
rename to Sources/GrAIdient/Metal/Kernel/LayerSeqFloat.metal
index a5957708..b0bcfb3c 100644
--- a/Sources/GrAIdient/Metal/Kernel/LayerSeq.metal
+++ b/Sources/GrAIdient/Metal/Kernel/LayerSeqFloat.metal
@@ -8,7 +8,7 @@
 #include <metal_stdlib>
 using namespace metal;
 
-kernel void avgPoolSeqForward(
+kernel void avgPoolSeqForwardFloat(
     const device float * outsPrev,
     constant uint * pNbNeurons,
     constant uint * pNbBatch,
@@ -50,7 +50,7 @@ kernel void avgPoolSeqForward(
     outs[offset] = tmp;
 }
 
-kernel void avgPoolSeqBackward(
+kernel void avgPoolSeqBackwardFloat(
     const device float * delta,
     constant uint * pNbNeurons,
     constant uint * pNbBatch,
@@ -98,7 +98,7 @@ kernel void avgPoolSeqBackward(
     }
 }
 
-kernel void selectSeqForward(
+kernel void selectSeqForwardFloat(
     const device float * outsPrev,
     constant uint * pNbNeurons,
     constant uint * pTargetSeq,
@@ -137,7 +137,7 @@ kernel void selectSeqForward(
     outs[offset] = outsPrev[offsetPrev];
 }
 
-kernel void selectSeqBackward(
+kernel void selectSeqBackwardFloat(
     const device float * delta,
     constant uint * pNbNeurons,
     constant uint * pTargetSeq,
@@ -176,7 +176,7 @@
deltaPrev[offsetPrev] += delta[offset]; } -kernel void concat1SeqForward( +kernel void concat1SeqForwardFloat( const device float * outsPrev, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -221,7 +221,7 @@ kernel void concat1SeqForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void concat1Seq4Forward( +kernel void concat1Seq4ForwardFloat( const device float4 * outsPrev, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -266,7 +266,7 @@ kernel void concat1Seq4Forward( outs[offset] = outsPrev[offsetPrev]; } -kernel void concat1SeqBackward( +kernel void concat1SeqBackwardFloat( const device float * delta, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -321,7 +321,7 @@ kernel void concat1SeqBackward( } } -kernel void concat1Seq4Backward( +kernel void concat1Seq4BackwardFloat( const device float4 * delta, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -376,7 +376,7 @@ kernel void concat1Seq4Backward( } } -kernel void concat2SeqForward( +kernel void concat2SeqForwardFloat( const device float * outsPrev, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -421,7 +421,7 @@ kernel void concat2SeqForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void concat2SeqBackward( +kernel void concat2SeqBackwardFloat( const device float * delta, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -476,7 +476,7 @@ kernel void concat2SeqBackward( } } -kernel void constant12SeqForward( +kernel void constant12SeqForwardFloat( const device float * weights, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -510,7 +510,7 @@ kernel void constant12SeqForward( outs[offset] = weights[depth + nbNeurons * seq]; } -kernel void constant12Seq4Forward( +kernel void constant12Seq4ForwardFloat( const device float4 * weights, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -545,7 +545,7 @@ kernel void constant12Seq4Forward( outs[offset] = weights[(depth * 4 + nbNeurons * seq) / 4]; } -kernel void constant12SeqBackward( +kernel void constant12SeqBackwardFloat( const device float * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -593,7 +593,7 @@ kernel void constant12SeqBackward( } } -kernel void constant12Seq4Backward( +kernel void constant12Seq4BackwardFloat( const device float4 * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -642,7 +642,7 @@ kernel void constant12Seq4Backward( } } -kernel void constant2SeqForward( +kernel void constant2SeqForwardFloat( const device float * weights, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -676,7 +676,7 @@ kernel void constant2SeqForward( outs[offset] = weights[depth]; } -kernel void constant2Seq4Forward( +kernel void constant2Seq4ForwardFloat( const device float4 * weights, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -711,7 +711,7 @@ kernel void constant2Seq4Forward( outs[offset] = weights[depth]; } -kernel void querySeqForward( +kernel void querySeqForwardFloat( const device float * query, const device float * key, constant uint * pNbHeads, @@ -772,7 +772,7 @@ kernel void querySeqForward( outs[offset] = tmp; } -kernel void querySeq4Forward( +kernel void querySeq4ForwardFloat( const device float4 * query, const device float4 * key, constant uint * pNbHeads, @@ -833,7 +833,7 @@ kernel void querySeq4Forward( outs[offset] = tmp[0] + tmp[1] + tmp[2] + tmp[3]; } -kernel void queryQuerySeqBackward( +kernel void queryQuerySeqBackwardFloat( const device float * delta, const device float * key, constant uint * pNbHeads, @@ -905,7 +905,7 @@ 
kernel void queryQuerySeqBackward( } } -kernel void queryQuerySeq4Backward( +kernel void queryQuerySeq4BackwardFloat( const device float * delta, const device float4 * key, constant uint * pNbHeads, @@ -977,7 +977,7 @@ kernel void queryQuerySeq4Backward( } } -kernel void queryKeySeqBackward( +kernel void queryKeySeqBackwardFloat( const device float * delta, const device float * query, constant uint * pNbHeads, @@ -1049,7 +1049,7 @@ kernel void queryKeySeqBackward( } } -kernel void queryKeySeq4Backward( +kernel void queryKeySeq4BackwardFloat( const device float * delta, const device float4 * query, constant uint * pNbHeads, @@ -1121,7 +1121,7 @@ kernel void queryKeySeq4Backward( } } -kernel void querySelfSeqForward( +kernel void querySelfSeqForwardFloat( const device float * outsPrev, constant uint * pNbHeads, constant uint * pNbNeurons, @@ -1191,7 +1191,7 @@ kernel void querySelfSeqForward( outs[offset] = tmp; } -kernel void querySelfSeq4Forward( +kernel void querySelfSeq4ForwardFloat( const device float4 * outsPrev, constant uint * pNbHeads, constant uint * pNbNeurons, @@ -1261,7 +1261,7 @@ kernel void querySelfSeq4Forward( outs[offset] = tmp[0] + tmp[1] + tmp[2] + tmp[3]; } -kernel void querySelfQuerySeqBackward( +kernel void querySelfQuerySeqBackwardFloat( const device float * outsPrev, const device float * delta, constant uint * pNbHeads, @@ -1342,7 +1342,7 @@ kernel void querySelfQuerySeqBackward( } } -kernel void querySelfQuerySeq4Backward( +kernel void querySelfQuerySeq4BackwardFloat( const device float4 * outsPrev, const device float * delta, constant uint * pNbHeads, @@ -1423,7 +1423,7 @@ kernel void querySelfQuerySeq4Backward( } } -kernel void querySelfKeySeqBackward( +kernel void querySelfKeySeqBackwardFloat( const device float * outsPrev, const device float * delta, constant uint * pNbHeads, @@ -1504,7 +1504,7 @@ kernel void querySelfKeySeqBackward( } } -kernel void querySelfKeySeq4Backward( +kernel void querySelfKeySeq4BackwardFloat( const device float4 * outsPrev, const device float * delta, constant uint * pNbHeads, @@ -1585,7 +1585,7 @@ kernel void querySelfKeySeq4Backward( } } -kernel void softmaxSeqForward( +kernel void softmaxSeqForwardFloat( const device float * outsPrev, constant uint * pNbHeads, constant uint * pNbNeurons, @@ -1651,7 +1651,7 @@ kernel void softmaxSeqForward( outs[offset] = exp(outPrev - cMax) / sum1; } -kernel void softmaxSeq4Forward( +kernel void softmaxSeq4ForwardFloat( const device float4 * outsPrev, constant uint * pNbHeads, constant uint * pNbNeurons, @@ -1723,7 +1723,7 @@ kernel void softmaxSeq4Forward( outs[offset] = exp(outPrev - cMax) / sum2; } -kernel void softmaxSeqBackward( +kernel void softmaxSeqBackwardFloat( const device float * outs, const device float * delta, constant uint * pNbHeads, @@ -1789,7 +1789,7 @@ kernel void softmaxSeqBackward( } } -kernel void softmaxSeq4Backward( +kernel void softmaxSeq4BackwardFloat( const device float4 * outs, const device float4 * delta, constant uint * pNbHeads, @@ -1857,7 +1857,7 @@ kernel void softmaxSeq4Backward( } } -kernel void valueSeqForward( +kernel void valueSeqForwardFloat( const device float * value, const device float * score, constant uint * pNbHeads, @@ -1915,7 +1915,7 @@ kernel void valueSeqForward( outs[offset] = tmp; } -kernel void valueSeq4Forward( +kernel void valueSeq4ForwardFloat( const device float4 * value, const device float * score, constant uint * pNbHeads, @@ -1973,7 +1973,7 @@ kernel void valueSeq4Forward( outs[offset] = tmp; } -kernel void valueValueSeqBackward( +kernel 
void valueValueSeqBackwardFloat( const device float * delta, const device float * score, constant uint * pNbHeads, @@ -2042,7 +2042,7 @@ kernel void valueValueSeqBackward( } } -kernel void valueValueSeq4Backward( +kernel void valueValueSeq4BackwardFloat( const device float4 * delta, const device float * score, constant uint * pNbHeads, @@ -2113,7 +2113,7 @@ kernel void valueValueSeq4Backward( } } -kernel void valueScoreSeqBackward( +kernel void valueScoreSeqBackwardFloat( const device float * delta, const device float * value, constant uint * pNbHeads, @@ -2184,7 +2184,7 @@ kernel void valueScoreSeqBackward( } } -kernel void valueScoreSeq4Backward( +kernel void valueScoreSeq4BackwardFloat( const device float4 * delta, const device float4 * value, constant uint * pNbHeads, @@ -2256,7 +2256,7 @@ kernel void valueScoreSeq4Backward( } } -kernel void valueSelfSeqForward( +kernel void valueSelfSeqForwardFloat( const device float * value, const device float * score, constant uint * pNbHeads, @@ -2323,7 +2323,7 @@ kernel void valueSelfSeqForward( outs[offset] = tmp; } -kernel void valueSelfSeq4Forward( +kernel void valueSelfSeq4ForwardFloat( const device float4 * value, const device float * score, constant uint * pNbHeads, @@ -2391,7 +2391,7 @@ kernel void valueSelfSeq4Forward( outs[offset] = tmp; } -kernel void valueSelfValueSeqBackward( +kernel void valueSelfValueSeqBackwardFloat( const device float * delta, const device float * score, constant uint * pNbHeads, @@ -2459,7 +2459,7 @@ kernel void valueSelfValueSeqBackward( value[offsetValue] += tmp; } -kernel void valueSelfValueSeq4Backward( +kernel void valueSelfValueSeq4BackwardFloat( const device float4 * delta, const device float * score, constant uint * pNbHeads, @@ -2528,7 +2528,7 @@ kernel void valueSelfValueSeq4Backward( value[offsetValue] += tmp; } -kernel void valueSelfScoreSeqBackward( +kernel void valueSelfScoreSeqBackwardFloat( const device float * delta, const device float * value, constant uint * pNbHeads, @@ -2607,7 +2607,7 @@ kernel void valueSelfScoreSeqBackward( } } -kernel void valueSelfScoreSeq4Backward( +kernel void valueSelfScoreSeq4BackwardFloat( const device float4 * delta, const device float4 * value, constant uint * pNbHeads, @@ -2687,7 +2687,7 @@ kernel void valueSelfScoreSeq4Backward( } } -kernel void layerCAMSeqForward( +kernel void layerCAMSeqForwardFloat( const device float * outsPrev, const device float * deltaPrev, constant uint * pNbNeuronsPrev, diff --git a/Sources/GrAIdient/Metal/Kernel/LayerSeqHalf.metal b/Sources/GrAIdient/Metal/Kernel/LayerSeqHalf.metal new file mode 100644 index 00000000..bc1c1bed --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/LayerSeqHalf.metal @@ -0,0 +1,2745 @@ +// +// LayerSeq.metal +// GrAIdient +// +// Created by Jean-François Reboud on 27/02/2023. 
+//
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void avgPoolSeqForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence &&
+        outsPrev && outs)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint seq=0; seq<sequence; seq++)
+    {
+        uint offsetPrev = depth +
+            nbNeurons * seq + sequence * nbNeurons * elem;
+        tmp += outsPrev[offsetPrev];
+    }
+    tmp /= sequence;
+
+    uint offset = depth + nbNeurons * elem;
+    outs[offset] = tmp;
+}
+
+kernel void avgPoolSeqBackwardHalf(
+    const device half * delta,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    uint dirty;
+
+    if (pNbNeurons && pNbBatch && pSequence && pDirty &&
+        delta && deltaPrev)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1] / sequence;
+    uint seq = id[1] % sequence;
+
+    if (depth >= nbNeurons || elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    float deltaCur = delta[offset];
+
+    uint offsetPrev = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+    if (dirty)
+    {
+        deltaPrev[offsetPrev] = deltaCur / sequence;
+    }
+    else
+    {
+        deltaPrev[offsetPrev] += deltaCur / sequence;
+    }
+}
+
+kernel void selectSeqForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbNeurons,
+    constant uint * pTargetSeq,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint targetSeq;
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pTargetSeq && pNbNeurons && pNbBatch && pSequence &&
+        outsPrev && outs)
+    {
+        targetSeq = *pTargetSeq;
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    uint offsetPrev = depth +
+        nbNeurons * targetSeq + sequence * nbNeurons * elem;
+    outs[offset] = outsPrev[offsetPrev];
+}
+
+kernel void selectSeqBackwardHalf(
+    const device half * delta,
+    constant uint * pNbNeurons,
+    constant uint * pTargetSeq,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    uint targetSeq;
+
+    if (pNbNeurons && pTargetSeq && pNbBatch && pSequence &&
+        deltaPrev && delta)
+    {
+        targetSeq = *pTargetSeq;
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    uint offsetPrev = depth +
+        nbNeurons * targetSeq + sequence * nbNeurons * elem;
+    deltaPrev[offsetPrev] += delta[offset];
+}
+
+kernel void concat1SeqForwardHalf(
+    const device half * outsPrev,
+    constant uint * pGlobalOffset,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    constant uint * pSequencePrev,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    uint sequencePrev;
+    uint globalOffset;
+
+    if (pGlobalOffset && pNbNeurons &&
+        pNbBatch && pSequence && pSequencePrev && outsPrev && outs)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+        sequencePrev = *pSequencePrev;
+        globalOffset = *pGlobalOffset;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1] / sequencePrev;
+    uint seq = id[1] % sequencePrev;
+
+    if (depth >= nbNeurons || elem >= nbBatch || seq >= sequencePrev)
+    {
+        return ;
+    }
+
+    uint offsetPrev = depth +
+        nbNeurons * seq + sequencePrev * nbNeurons * elem;
+    uint
offset = depth + + nbNeurons * (globalOffset+seq) + sequence * nbNeurons * elem; + + outs[offset] = outsPrev[offsetPrev]; +} + +kernel void concat1Seq4ForwardHalf( + const device half4 * outsPrev, + constant uint * pGlobalOffset, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + constant uint * pSequencePrev, + device half4 * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + uint sequencePrev; + uint globalOffset; + + if (pGlobalOffset && pNbNeurons && + pNbBatch && pSequence && pSequencePrev && outsPrev && outs) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + sequencePrev = *pSequencePrev; + globalOffset = *pGlobalOffset; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequencePrev; + uint seq = id[1] % sequencePrev; + + if (depth * 4 >= nbNeurons || elem >= nbBatch || seq >= sequencePrev) + { + return ; + } + + uint offsetPrev = (depth * 4 + + nbNeurons * seq + sequencePrev * nbNeurons * elem) / 4; + uint offset = (depth * 4 + + nbNeurons * (globalOffset+seq) + sequence * nbNeurons * elem) / 4; + + outs[offset] = outsPrev[offsetPrev]; +} + +kernel void concat1SeqBackwardHalf( + const device half * delta, + constant uint * pGlobalOffset, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + constant uint * pSequencePrev, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + uint sequencePrev; + uint globalOffset; + uint dirty; + + if (pGlobalOffset && pNbNeurons && + pNbBatch && pSequence && pSequencePrev && pDirty && deltaPrev && delta) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + sequencePrev = *pSequencePrev; + globalOffset = *pGlobalOffset; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequencePrev; + uint seq = id[1] % sequencePrev; + + if (depth >= nbNeurons || elem >= nbBatch || seq >= sequencePrev) + { + return ; + } + + uint offsetPrev = depth + + nbNeurons * seq + sequencePrev * nbNeurons * elem; + uint offset = depth + + nbNeurons * (globalOffset+seq) + sequence * nbNeurons * elem; + + if (dirty) + { + deltaPrev[offsetPrev] = delta[offset]; + } + else + { + deltaPrev[offsetPrev] += delta[offset]; + } +} + +kernel void concat1Seq4BackwardHalf( + const device half4 * delta, + constant uint * pGlobalOffset, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + constant uint * pSequencePrev, + constant uint * pDirty, + device half4 * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + uint sequencePrev; + uint globalOffset; + uint dirty; + + if (pGlobalOffset && pNbNeurons && + pNbBatch && pSequence && pSequencePrev && pDirty && deltaPrev && delta) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + sequencePrev = *pSequencePrev; + globalOffset = *pGlobalOffset; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequencePrev; + uint seq = id[1] % sequencePrev; + + if (depth * 4 >= nbNeurons || elem >= nbBatch || seq >= sequencePrev) + { + return ; + } + + uint offsetPrev = (depth * 4 + + nbNeurons * seq + sequencePrev * nbNeurons * elem) / 4; + uint offset = (depth * 4 + + nbNeurons * (globalOffset+seq) + sequence * nbNeurons * elem) / 4; + + if (dirty) + { + 
deltaPrev[offsetPrev] = delta[offset]; + } + else + { + deltaPrev[offsetPrev] += delta[offset]; + } +} + +kernel void concat2SeqForwardHalf( + const device half * outsPrev, + constant uint * pGlobalOffset, + constant uint * pNbNeurons, + constant uint * pNbNeuronsPrev, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbNeuronsPrev; + uint nbBatch; + uint sequence; + uint globalOffset; + + if (pGlobalOffset && pNbNeurons && pNbNeuronsPrev && + pNbBatch && pSequence && outsPrev && outs) + { + nbNeurons = *pNbNeurons; + nbNeuronsPrev = *pNbNeuronsPrev; + nbBatch = *pNbBatch; + sequence = *pSequence; + globalOffset = *pGlobalOffset; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth >= nbNeuronsPrev || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offsetPrev = depth + + nbNeuronsPrev * seq + sequence * nbNeuronsPrev * elem; + uint offset = globalOffset+depth + + nbNeurons * seq + sequence * nbNeurons * elem; + + outs[offset] = outsPrev[offsetPrev]; +} + +kernel void concat2SeqBackwardHalf( + const device half * delta, + constant uint * pGlobalOffset, + constant uint * pNbNeurons, + constant uint * pNbNeuronsPrev, + constant uint * pNbBatch, + constant uint * pSequence, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbNeuronsPrev; + uint nbBatch; + uint sequence; + uint globalOffset; + uint dirty; + + if (pGlobalOffset && pNbNeurons && pNbNeuronsPrev && + pNbBatch && pSequence && pDirty && deltaPrev && delta) + { + nbNeurons = *pNbNeurons; + nbNeuronsPrev = *pNbNeuronsPrev; + nbBatch = *pNbBatch; + sequence = *pSequence; + globalOffset = *pGlobalOffset; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth >= nbNeuronsPrev || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offsetPrev = depth + + nbNeuronsPrev * seq + sequence * nbNeuronsPrev * elem; + uint offset = globalOffset+depth + + nbNeurons * seq + sequence * nbNeurons * elem; + + if (dirty) + { + deltaPrev[offsetPrev] = delta[offset]; + } + else + { + deltaPrev[offsetPrev] += delta[offset]; + } +} + +kernel void constant12SeqForwardHalf( + const device half * weights, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pNbBatch && pSequence && weights && outs) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth >= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem; + outs[offset] = weights[depth + nbNeurons * seq]; +} + +kernel void constant12Seq4ForwardHalf( + const device half4 * weights, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + device half4 * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pNbBatch && pSequence && weights && outs) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint 
depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth * 4 >= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offset = + (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4; + outs[offset] = weights[(depth * 4 + nbNeurons * seq) / 4]; +} + +kernel void constant12SeqBackwardHalf( + const device half * delta, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + constant uint * pAccumulate, + device half * grads, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + uint accumulate; + + if (pNbNeurons && pNbBatch && pSequence && pAccumulate && delta && grads) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id[0]; + uint seq = id[1]; + if (depth >= nbNeurons || seq >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem= nbNeurons || seq >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint elem=0; elem= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem; + outs[offset] = weights[depth]; +} + +kernel void constant2Seq4ForwardHalf( + const device half4 * weights, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + device half4 * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pNbBatch && pSequence && weights && outs) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth * 4 >= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offset = + (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4; + outs[offset] = weights[depth]; +} + +kernel void querySeqForwardHalf( + const device half * query, + const device half * key, + constant uint * pNbHeads, + constant uint * pNbNeurons, + constant uint * pNbNeuronsPrev, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbHeads; + uint nbNeurons; + uint nbNeuronsPrev; + uint nbBatch; + uint sequence; + uint size; + + if (pNbHeads && pNbNeurons && pNbNeuronsPrev && pNbBatch && pSequence && + query && key && outs) + { + nbHeads = *pNbHeads; + nbNeurons = *pNbNeurons; + nbNeuronsPrev = *pNbNeuronsPrev; + nbBatch = *pNbBatch; + sequence = *pSequence; + size = nbNeuronsPrev / nbHeads; + } + else + return ; + + uint head = id[0] / sequence; + uint seqK = id[0] % sequence; + uint elem = id[1] / sequence; + uint seqQ = id[1] % sequence; + + if (head >= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint j=0; j= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint j=0; j= nbHeads || j >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j * 4 >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || j * 4 >= size || + 
elem >= nbBatch || seqK >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint j=0; j= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint j=0; j= nbHeads || j >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j * 4 >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || j * 4 >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqQ=0; seqQ= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + float cMax = outsPrev[ + 0+head*size + nbNeurons * seq + sequence * nbNeurons * elem + ]; + for (uint j=0; j cMax) + { + cMax = outPrev; + } + } + + float sum1 = 0.0; + for (uint j=0; j= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + float cMax = outsPrev[ + (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4 + ][0]; + for (uint j=0; j cMax) + { + cMax = max3; + } + } + + half4 sum1 = 0.0; + for (uint j=0; j= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem; + float outCur = outs[offset]; + float deltaCur = delta[offset]; + + float sum1 = 0.0; + for (uint j=0; j= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offset = + (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4; + half4 outCur = outs[offset]; + half4 deltaCur = delta[offset]; + + half4 sum1 = 0.0; + for (uint j=0; j= nbHeads || j >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || j * 4 >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint j=0; j= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint j=0; j= nbHeads || j >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || j * 4 >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint j=0; j= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint j=0; j= sequence || elem >= nbBatch) + { + return ; + } + + float sum = 0.0; + for 
(uint depthPrev=0; depthPrev using namespace metal; -kernel void clipGradients( +kernel void clipGradientsFloat( constant uint * pNbElems, constant float * pGradientNorm, constant float * pNormThreshold, @@ -36,7 +36,7 @@ kernel void clipGradients( grads[id] = grads[id] * normThreshold / gradientNorm; } -kernel void multiplyGradients( +kernel void multiplyGradientsFloat( constant uint * pNbElems, constant float * pFactor, device float * grads, @@ -61,7 +61,7 @@ kernel void multiplyGradients( grads[id] = grads[id] * factor; } -kernel void weightsSGD( +kernel void weightsSGDFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, @@ -94,7 +94,7 @@ kernel void weightsSGD( weights[id] = weights[id] - alpha * g; } -kernel void weightsMomentum( +kernel void weightsMomentumFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, @@ -133,7 +133,7 @@ kernel void weightsMomentum( weights[id] = weights[id] - v; } -kernel void weightsAdam( +kernel void weightsAdamFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, @@ -184,7 +184,7 @@ kernel void weightsAdam( weights[id] = weights[id] - alpha * m / (sqrt(v) + Ɛ); } -kernel void weightsAMSGrad( +kernel void weightsAMSGradFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, @@ -239,7 +239,7 @@ kernel void weightsAMSGrad( weights[id] = weights[id] - alpha * m / (sqrt(vHat) + Ɛ); } -kernel void weightsAdamRectified( +kernel void weightsAdamRectifiedFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, @@ -302,7 +302,7 @@ kernel void weightsAdamRectified( } } -kernel void weightsAdaBound( +kernel void weightsAdaBoundFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, @@ -368,7 +368,7 @@ kernel void weightsAdaBound( weights[id] = weights[id] - alphaHat * m; } -kernel void weightsAMSBound( +kernel void weightsAMSBoundFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, diff --git a/Sources/GrAIdient/Metal/Kernel/OptimizerHalf.metal b/Sources/GrAIdient/Metal/Kernel/OptimizerHalf.metal new file mode 100644 index 00000000..ea7c7ce8 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/OptimizerHalf.metal @@ -0,0 +1,438 @@ +// +// Optimizer.metal +// GrAIdient +// +// Created by Jean-François Reboud on 09/10/2022. 
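// Editor's note (sketch, not from the original patch): the Half optimizer
// kernels below keep weights, gradients and moment buffers in `half`, take
// hyper-parameters as `constant float`, and mostly do the update arithmetic
// in float before a single rounding store (AMSGrad/AMSBound keep half
// moments). For reference, weightsAdamHalf implements, with β1 = 0.9,
// β2 = 0.999 and Ɛ = 1e-8 hard-coded:
//
//     g ← g + λ·w                          (optional weight decay)
//     m ← β1·m + (1 − β1)·g
//     v ← β2·v + (1 − β2)·g²
//     w ← w − α · (m / (1 − β1^t)) / ( sqrt(v / (1 − β2^t)) + Ɛ )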
+// + +#include +using namespace metal; + +kernel void clipGradientsHalf( + constant uint * pNbElems, + constant float * pGradientNorm, + constant float * pNormThreshold, + device half * grads, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float gradientNorm; + float normThreshold; + + if (pNbElems && pGradientNorm && pNormThreshold && grads) + { + nbElems = *pNbElems; + gradientNorm = *pGradientNorm; + normThreshold = *pNormThreshold; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + grads[id] = grads[id] * normThreshold / gradientNorm; +} + +kernel void multiplyGradientsHalf( + constant uint * pNbElems, + constant float * pFactor, + device half * grads, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float factor; + + if (pNbElems && pFactor && grads) + { + nbElems = *pNbElems; + factor = *pFactor; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + grads[id] = grads[id] * factor; +} + +kernel void weightsSGDHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + device half * weights, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, lambda; + + if (pNbElems && pAlpha && pLambda && grads && weights) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + weights[id] = weights[id] - alpha * g; +} + +kernel void weightsMomentumHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + device half * weights, + device half * mPtr, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, lambda; + float β1 = 0.9; + + if (pNbElems && pAlpha && pLambda && grads && weights && mPtr) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + + float v = β1 * mPtr[id] + alpha * g; + mPtr[id] = v; + + weights[id] = weights[id] - v; +} + +kernel void weightsAdamHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + constant float * pT, + device half * weights, + device half * mPtr, + device half * vPtr, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, lambda; + float t; + float β1 = 0.9; + float β2 = 0.999; + float Ɛ = 0.00000001; + + if (pNbElems && pAlpha && pLambda && pT && + grads && weights && mPtr && vPtr) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + t = *pT; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + + float m = β1 * mPtr[id] + (1 - β1) * g; + float v = β2 * vPtr[id] + (1 - β2) * g * g; + mPtr[id] = m; + vPtr[id] = v; + + m /= (1 - pow(β1, t)); + v /= (1 - pow(β2, t)); + + weights[id] = weights[id] - alpha * m / (sqrt(v) + Ɛ); +} + +kernel void weightsAMSGradHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + constant float * pT, + device half * weights, + device half * mPtr, + device half * vPtr, + device half * vHatPtr, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, lambda; + float t; + float β1 = 0.9; + float β2 = 0.999; + float Ɛ = 
0.00000001; + + if (pNbElems && pAlpha && pLambda && pT && + grads && weights && mPtr && vPtr && vHatPtr) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + t = *pT; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + + half m = β1 * mPtr[id] + (1 - β1) * g; + half v = β2 * vPtr[id] + (1 - β2) * g * g; + half vHat = max(v, vHatPtr[id]); + + mPtr[id] = m; + vPtr[id] = v; + vHatPtr[id] = vHat; + + m /= (1 - pow(β1, t)); + vHat /= (1 - pow(β2, t)); + + weights[id] = weights[id] - alpha * m / (sqrt(vHat) + Ɛ); +} + +kernel void weightsAdamRectifiedHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + constant float * pT, + device half * weights, + device half * mPtr, + device half * vPtr, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, lambda; + float t; + float β1 = 0.9; + float β2 = 0.999; + float Ɛ = 0.00000001; + float ρinf = 2.0 / (1.0 - β2) - 1.0; + + if (pNbElems && pAlpha && pLambda && pT && + grads && weights && mPtr && vPtr) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + t = *pT; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + + float m = β1 * mPtr[id] + (1 - β1) * g; + float v = β2 * vPtr[id] + (1 - β2) * g * g; + mPtr[id] = m; + vPtr[id] = v; + + m /= (1 - pow(β1, t)); + float ρ = ρinf - 2.0 * t * pow(β2, t) / (1 - pow(β2, t)); + + if (ρ > 5.0) + { + float l = sqrt((1 - pow(β2, t)) / (v + Ɛ)); + float r = sqrt(((ρ - 4.0) * (ρ - 2.0) * ρinf) / + ((ρinf - 4.0) * (ρinf - 2.0) * ρ)); + + weights[id] = weights[id] - alpha * m * r * l; + } + else + { + weights[id] = weights[id] - alpha * m; + } +} + +kernel void weightsAdaBoundHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + constant float * pT, + constant float * pLowerBound, + constant float * pUpperBound, + device half * weights, + device half * mPtr, + device half * vPtr, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, lambda; + float t; + float β1 = 0.9; + float β2 = 0.999; + float Ɛ = 0.00000001; + float lowerBound; + float upperBound; + + if (pNbElems && pAlpha && pLambda && pT && pLowerBound && pUpperBound && + grads && weights && mPtr && vPtr) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + t = *pT; + lowerBound = *pLowerBound; + upperBound = *pUpperBound; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + + float m = β1 * mPtr[id] + (1 - β1) * g; + float v = β2 * vPtr[id] + (1 - β2) * g * g; + + mPtr[id] = m; + vPtr[id] = v; + + float alphaHat = alpha * + sqrt(1 - pow(β2, t)) / ((sqrt(v) + Ɛ) * (1 - pow(β1, t))); + if (alphaHat < lowerBound) + { + alphaHat = lowerBound; + } + else if (alphaHat > upperBound) + { + alphaHat = upperBound; + } + + weights[id] = weights[id] - alphaHat * m; +} + +kernel void weightsAMSBoundHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + constant float * pT, + constant float * pLowerBound, + constant float * pUpperBound, + device half * weights, + device half * mPtr, + device half * vPtr, + device half * vHatPtr, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, 
lambda; + float t; + float β1 = 0.9; + float β2 = 0.999; + float Ɛ = 0.00000001; + float lowerBound; + float upperBound; + + if (pNbElems && pAlpha && pLambda && pT && pLowerBound && pUpperBound && + grads && weights && mPtr && vPtr && vHatPtr) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + t = *pT; + lowerBound = *pLowerBound; + upperBound = *pUpperBound; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + + half m = β1 * mPtr[id] + (1 - β1) * g; + half v = β2 * vPtr[id] + (1 - β2) * g * g; + half vHat = max(v, vHatPtr[id]); + + mPtr[id] = m; + vPtr[id] = v; + vHatPtr[id] = vHat; + + float alphaHat = alpha * + sqrt(1 - pow(β2, t)) / ((sqrt(vHat) + Ɛ) * (1 - pow(β1, t))); + if (alphaHat < lowerBound) + { + alphaHat = lowerBound; + } + else if (alphaHat > upperBound) + { + alphaHat = upperBound; + } + + weights[id] = weights[id] - alphaHat * m; +} diff --git a/Sources/GrAIdient/Metal/Kernel/Reduce.metal b/Sources/GrAIdient/Metal/Kernel/ReduceFloat.metal similarity index 97% rename from Sources/GrAIdient/Metal/Kernel/Reduce.metal rename to Sources/GrAIdient/Metal/Kernel/ReduceFloat.metal index 4fd9fd1b..e390ae83 100644 --- a/Sources/GrAIdient/Metal/Kernel/Reduce.metal +++ b/Sources/GrAIdient/Metal/Kernel/ReduceFloat.metal @@ -8,7 +8,7 @@ #include using namespace metal; -kernel void reduceSum64( +kernel void reduceSum64Float( const device float * ins, constant uint * pDimensions, constant uint * pNbThreadgroups, @@ -62,7 +62,7 @@ kernel void reduceSum64( } } -kernel void reduceSum( +kernel void reduceSumFloat( const device float * ins, constant uint * pDimensions, device float * outs, @@ -94,7 +94,7 @@ kernel void reduceSum( outs[elem2] = sum; } -kernel void reduceMax64( +kernel void reduceMax64Float( const device float * ins, constant uint * pDimensions, constant uint * pNbThreadgroups, @@ -151,7 +151,7 @@ kernel void reduceMax64( } } -kernel void reduceMax( +kernel void reduceMaxFloat( const device float * ins, constant uint * pDimensions, device float * outs, diff --git a/Sources/GrAIdient/Metal/Kernel/ReduceHalf.metal b/Sources/GrAIdient/Metal/Kernel/ReduceHalf.metal new file mode 100644 index 00000000..99662efb --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/ReduceHalf.metal @@ -0,0 +1,184 @@ +// +// Reduce.metal +// GrAIdient +// +// Created by Jean-François Reboud on 17/05/2023. 
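// Editor's note (sketch, not from the original patch): each reduction below
// is a two-pass scheme. reduceSum64Half runs with 64 threads per threadgroup
// along dim1; every threadgroup folds its 64-wide slice in threadgroup
// memory (the sum keeps float accumulators) and thread 0 writes one partial,
// so the host allocates dim2 × nbThreadgroups partials with
//
//     nbThreadgroups = (dim1 + 63) / 64
//
// reduceSumHalf then folds those partials serially per row; reduceMax64Half
// and reduceMaxHalf follow the same contract with max in place of +.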
+// + +#include +using namespace metal; + +kernel void reduceSum64Half( + const device half * ins, + constant uint * pDimensions, + constant uint * pNbThreadgroups, + device half * outs, + uint2 groupId [[ threadgroup_position_in_grid ]], + uint2 threadId [[ thread_position_in_threadgroup ]], + uint2 id [[ thread_position_in_grid ]]) +{ + constexpr uint threadsPerThreadgroup = 64; + threadgroup float sumShared[threadsPerThreadgroup]; + + uint dim1; + uint dim2; + uint nbThreadgroups; + + if (pDimensions && pNbThreadgroups && ins && outs) + { + dim1 = pDimensions[0]; + dim2 = pDimensions[1]; + nbThreadgroups = *pNbThreadgroups; + } + else + return ; + + uint elem1 = id[0]; + uint elem2 = id[1]; + + if (elem1 >= dim1 && elem2 >= dim2) + { + return ; + } + + uint offset = elem2 * dim1 + elem1; + sumShared[threadId[0]] = ins[offset]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint stride=threadsPerThreadgroup/2; stride>0; stride>>=1) + { + uint index = threadId[0] + groupId[0] * threadsPerThreadgroup; + if (threadId[0] < stride && (index + stride) < dim1) + { + sumShared[threadId[0]] += sumShared[threadId[0] + stride]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (threadId[0] == 0) + { + uint offset = elem2 * nbThreadgroups + groupId[0]; + outs[offset] = sumShared[0]; + } +} + +kernel void reduceSumHalf( + const device half * ins, + constant uint * pDimensions, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint dim1; + uint dim2; + + if (pDimensions && ins && outs) + { + dim1 = pDimensions[0]; + dim2 = pDimensions[1]; + } + else + return ; + + uint elem2 = id; + if (elem2 >= dim2) + { + return ; + } + + float sum = 0.0; + for (uint elem1=0; elem1= dim1 && elem2 >= dim2) + { + return ; + } + + uint offset = elem2 * dim1 + elem1; + valShared[threadId[0]] = ins[offset]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint stride=threadsPerThreadgroup/2; stride>0; stride>>=1) + { + uint index = threadId[0] + groupId[0] * threadsPerThreadgroup; + if (threadId[0] < stride && (index + stride) < dim1) + { + valShared[threadId[0]] = max( + valShared[threadId[0] + stride], + valShared[threadId[0]] + ); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (threadId[0] == 0) + { + uint offset = elem2 * nbThreadgroups + groupId[0]; + outs[offset] = valShared[0]; + } +} + +kernel void reduceMaxHalf( + const device half * ins, + constant uint * pDimensions, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint dim1; + uint dim2; + + if (pDimensions && ins && outs) + { + dim1 = pDimensions[0]; + dim2 = pDimensions[1]; + } + else + return ; + + uint elem2 = id; + if (elem2 >= dim2) + { + return ; + } + + half val = ins[elem2 * dim1]; + for (uint elem1=0; elem1 using namespace metal; -kernel void reset( +kernel void resetFloat( constant uint * pNbElems, device float * outs, uint id [[ thread_position_in_grid ]]) diff --git a/Sources/GrAIdient/Metal/Kernel/ResetHalf.metal b/Sources/GrAIdient/Metal/Kernel/ResetHalf.metal new file mode 100644 index 00000000..6fadea01 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/ResetHalf.metal @@ -0,0 +1,77 @@ +// +// Reset.metal +// GrAIdient +// +// Created by Jean-François Reboud on 14/10/2022. 
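// Editor's note (sketch, not from the original patch): besides zero-filling
// half buffers, this file carries the two element-wise casts used when a
// buffer moves between precisions: convertFloat2Half and convertHalf2Float
// copy nbElems values with one rounding per element. Half holds roughly
// three decimal digits and tops out at 65504, so e.g. (half)0.1f stores
// 0.0999756 and (half)1.0e5f overflows to +inf; the float-to-half direction
// is only lossless for values known to stay in range.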
+// + +#include +using namespace metal; + +kernel void resetHalf( + constant uint * pNbElems, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems && outs) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + outs[id] = 0.0; +} + +kernel void convertFloat2Half( + constant float * ins, + constant uint * pNbElems, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems && outs) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + outs[id] = (half)ins[id]; +} + +kernel void convertHalf2Float( + constant half * ins, + constant uint * pNbElems, + device float * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems && outs) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + outs[id] = (float)ins[id]; +} diff --git a/Sources/GrAIdient/Metal/Kernel/VQ2D.metal b/Sources/GrAIdient/Metal/Kernel/VQ2DFloat.metal similarity index 98% rename from Sources/GrAIdient/Metal/Kernel/VQ2D.metal rename to Sources/GrAIdient/Metal/Kernel/VQ2DFloat.metal index 720a64b6..10f74050 100644 --- a/Sources/GrAIdient/Metal/Kernel/VQ2D.metal +++ b/Sources/GrAIdient/Metal/Kernel/VQ2DFloat.metal @@ -8,7 +8,7 @@ #include using namespace metal; -kernel void vq2DForward( +kernel void vq2DForwardFloat( const device float * outsPrev, const device float * weights, constant uint * pNbChannels, @@ -83,7 +83,7 @@ kernel void vq2DForward( } } -kernel void vq2DBackward( +kernel void vq2DBackwardFloat( const device float * outsPrev, const device float * delta, const device float * weights, @@ -160,7 +160,7 @@ kernel void vq2DBackward( } } -kernel void vq2DBatchDerWeights( +kernel void vq2DBatchDerWeightsFloat( const device float * outsPrev, const device float * weights, const device int * indices, @@ -223,7 +223,7 @@ kernel void vq2DBatchDerWeights( grads[depth + nbChannels * k] += sum; } -kernel void vq2DDerWeights( +kernel void vq2DDerWeightsFloat( const device float * outsPrev, const device float * weights, const device int * indices, @@ -286,7 +286,7 @@ kernel void vq2DDerWeights( deltaWeights[depth + nbChannels * k + K * nbChannels * elem] += sum; } -kernel void vq2DReduceWeights( +kernel void vq2DReduceWeightsFloat( const device float * deltaWeights, constant uint * pNbChannels, constant uint * pK, @@ -336,7 +336,7 @@ kernel void vq2DReduceWeights( } } -kernel void vq2DLoss( +kernel void vq2DLossFloat( const device float * outsPrev, const device float * outs, const device int * indices, @@ -391,7 +391,7 @@ kernel void vq2DLoss( losses[elem] = tmp; } -kernel void vqLayerCAMMax2D( +kernel void vqLayerCAMMax2DFloat( const device float * camLayer, constant uint * pNbChannels, constant uint * pDimensions, @@ -455,7 +455,7 @@ kernel void vqLayerCAMMax2D( } } -kernel void vqGrad2DForward( +kernel void vqGrad2DForwardFloat( const device float * outsPrev, const device float * camLayer, const device float * camMax, diff --git a/Sources/GrAIdient/Metal/Kernel/VQ2DHalf.metal b/Sources/GrAIdient/Metal/Kernel/VQ2DHalf.metal new file mode 100644 index 00000000..d1edee8f --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/VQ2DHalf.metal @@ -0,0 +1,544 @@ +// +// VQ2D.metal +// GrAIdient +// +// Created by Jean-François Reboud on 29/03/2023. 
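// Editor's note (sketch, not from the original patch): vq2DForwardHalf
// snaps each spatial position x (a nbChannels vector) to its nearest
// codebook entry,
//
//     k* = argmin_k Σ_d (x_d − w_{d,k})² ,
//
// writing the quantized vector to `outs` and k* to `indices`.
// vq2DBackwardHalf passes the incoming delta straight through and adds the
// gradient of the commitment term,
//
//     ∂/∂x [ β/N · ‖x − w_{k*}‖² ] = 2β/N · (x − w_{k*}) ,
//
// with N = nbBatch·height·width, which is the `// Commitment term.` line in
// the code below.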
+// + +#include +using namespace metal; + +kernel void vq2DForwardHalf( + const device half * outsPrev, + const device half * weights, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pK, + constant uint * pNbBatch, + device half * outs, + device int * indices, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint K; + uint nbBatch; + + if (pNbChannels && pDimensions && pK && pNbBatch && + weights && outsPrev && outs && indices) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + K = *pK; + nbBatch = *pNbBatch; + } + else + return ; + + uint elem = id[1]; + uint i = id[0] / width; + uint j = id[0] % width; + + if (i * j >= height * width || elem >= nbBatch) + { + return ; + } + + int minIndex = -1; + float minValue = 0.0; + for (uint k=0; k= 0) + { + for (uint depth=0; depth= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + int minIndex = indices[j + (elem * height + i) * width]; + if (minIndex >= 0) + { + uint offsetWeights = depth + nbChannels * minIndex; + + float vq = weights[offsetWeights]; + float deltaCur = delta[offset]; + float outPrev = outsPrev[offset]; + + if (dirty) + { + deltaPrev[offset] = deltaCur; + } + else + { + deltaPrev[offset] += deltaCur; + } + + // Commitment term. + deltaPrev[offset] += beta / (float)(nbBatch * height * width) * + 2.0 * (outPrev - vq); + } + else if (dirty) + { + deltaPrev[offset] = 0.0; + } +} + +kernel void vq2DBatchDerWeightsHalf( + const device half * outsPrev, + const device half * weights, + const device int * indices, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pK, + constant float * pCoeff, + constant uint * pNbBatch, + device half * grads, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint K; + float coeff; + uint nbBatch; + + if (pNbChannels && pDimensions && pK && pCoeff && pNbBatch && + outsPrev && weights && indices && grads) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + K = *pK; + coeff = *pCoeff; + nbBatch = *pNbBatch; + } + else + return ; + + uint k = id[1]; + uint depth = id[0]; + + if (depth >= nbChannels || k >= K) + { + return ; + } + + float sum = 0.0; + for (uint elem=0; elem= nbChannels || elem * k >= nbBatch * K) + { + return ; + } + + float sum = 0.0; + for (uint i=0; i= nbChannels || k >= K) + { + return ; + } + + float sum = 0.0; + for (uint elem=0; elem= nbBatch) + { + return ; + } + + float tmp = 0.0; + for (uint depth=0; depth= 0) + { + uint offset = j + (offsetStart + i) * width; + + float outPrev = outsPrev[offset]; + float vq = outs[offset]; + float diff = outPrev - vq; + + tmp += diff * diff; + } + }} + } + losses[elem] = tmp; +} + +kernel void vqLayerCAMMax2DHalf( + const device half * camLayer, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbThreadgroups, + constant uint * pNbBatch, + device half * camMax, + uint2 groupId [[ threadgroup_position_in_grid ]], + uint2 threadId [[ thread_position_in_threadgroup ]], + uint2 id [[ thread_position_in_grid ]]) +{ + constexpr uint threadsPerThreadgroup = 64; + threadgroup float camShared[threadsPerThreadgroup]; + + uint height, width; + uint nbChannels; + uint nbThreadgroups; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbThreadgroups && pNbBatch && + camLayer && camMax) 
+ { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbThreadgroups = *pNbThreadgroups; + nbBatch = *pNbBatch; + } + else + return ; + + uint elem = id[1]; + uint i = id[0] / width; + uint j = id[0] % width; + + if (i * j >= height * width || elem >= nbBatch) + { + return ; + } + + camShared[threadId[0]] = camLayer[j + (elem * height + i) * width]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint stride=threadsPerThreadgroup/2; stride>0; stride>>=1) + { + uint index = threadId[0] + groupId[0] * threadsPerThreadgroup; + if (threadId[0] < stride && + (index + stride) < height * width) + { + camShared[threadId[0]] = max( + camShared[threadId[0] + stride], + camShared[threadId[0]] + ); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (threadId[0] == 0) + { + uint offset = elem * nbThreadgroups + groupId[0]; + camMax[offset] = camShared[0]; + } +} + +kernel void vqGrad2DForwardHalf( + const device half * outsPrev, + const device half * camLayer, + const device half * camMax, + const device half * weights, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pK, + constant float * pMagnitudeCoeff, + constant uint * pNbBatch, + device half * outs, + device int * indices, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint K; + float magnitudeCoeff; + uint nbBatch; + + if (pNbChannels && pDimensions && pK && pMagnitudeCoeff && pNbBatch && + outsPrev && camLayer && camMax && weights && outs && indices) + { + width = pDimensions[0]; + height = pDimensions[1]; + magnitudeCoeff = *pMagnitudeCoeff; + nbChannels = *pNbChannels; + K = *pK; + nbBatch = *pNbBatch; + } + else + return ; + + uint elem = id[1]; + uint i = id[0] / width; + uint j = id[0] % width; + + if (i * j >= height * width || elem >= nbBatch) + { + return ; + } + + float cam = camLayer[j + (elem * height + i) * width]; + if (cam / camMax[elem] >= magnitudeCoeff) + { + int minIndex = -1; + float minValue = 0.0; + for (uint k=0; k= 0) + { + for (uint depth=0; depth using namespace metal; -kernel void vqSeqForward( +kernel void vqSeqForwardFloat( const device float * outsPrev, const device float * weights, constant uint * pNbNeurons, @@ -79,7 +79,7 @@ kernel void vqSeqForward( } } -kernel void vqSeqBackward( +kernel void vqSeqBackwardFloat( const device float * outsPrev, const device float * delta, const device float * weights, @@ -153,7 +153,7 @@ kernel void vqSeqBackward( } } -kernel void vqSeqBatchDerWeights( +kernel void vqSeqBatchDerWeightsFloat( const device float * outsPrev, const device float * weights, const device int * indices, @@ -213,7 +213,7 @@ kernel void vqSeqBatchDerWeights( grads[depth + nbNeurons * k] += sum; } -kernel void vqSeqDerWeights( +kernel void vqSeqDerWeightsFloat( const device float * outsPrev, const device float * weights, const device int * indices, @@ -273,7 +273,7 @@ kernel void vqSeqDerWeights( deltaWeights[depth + nbNeurons * k + K * nbNeurons * elem] += sum; } -kernel void vqSeqLoss( +kernel void vqSeqLossFloat( const device float * outsPrev, const device float * outs, const device int * indices, @@ -323,7 +323,7 @@ kernel void vqSeqLoss( losses[elem] = tmp; } -kernel void vqLayerCAMMaxSeq( +kernel void vqLayerCAMMaxSeqFloat( const device float * camLayer, constant uint * pNbNeurons, constant uint * pNbThreadgroups, @@ -385,7 +385,7 @@ kernel void vqLayerCAMMaxSeq( } } -kernel void vqGradSeqForward( +kernel void vqGradSeqForwardFloat( const device float * outsPrev, const 
device float * camLayer, const device float * camMax, diff --git a/Sources/GrAIdient/Metal/Kernel/VQSeqHalf.metal b/Sources/GrAIdient/Metal/Kernel/VQSeqHalf.metal new file mode 100644 index 00000000..91ebc250 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/VQSeqHalf.metal @@ -0,0 +1,472 @@ +// +// VQSeq.metal +// GrAIdient +// +// Created by Jean-François Reboud on 18/06/2023. +// + +#include +using namespace metal; + +kernel void vqSeqForwardHalf( + const device half * outsPrev, + const device half * weights, + constant uint * pNbNeurons, + constant uint * pK, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + device int * indices, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint K; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pK && pNbBatch && pSequence && + weights && outsPrev && outs && indices) + { + nbNeurons = *pNbNeurons; + K = *pK; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint elem = id[1]; + uint seq = id[0]; + + if (seq >= sequence || elem >= nbBatch) + { + return ; + } + + int minIndex = -1; + float minValue = 0.0; + for (uint k=0; k= 0) + { + for (uint depth=0; depth= sequence * nbBatch || + depth >= nbNeurons) + { + return ; + } + + uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem; + + int minIndex = indices[seq + elem * sequence]; + if (minIndex >= 0) + { + uint offsetWeights = depth + nbNeurons * minIndex; + + float vq = weights[offsetWeights]; + float deltaCur = delta[offset]; + float outPrev = outsPrev[offset]; + + if (dirty) + { + deltaPrev[offset] = deltaCur; + } + else + { + deltaPrev[offset] += deltaCur; + } + + // Commitment term. + deltaPrev[offset] += beta / (float)(nbBatch * sequence) * + 2.0 * (outPrev - vq); + } + else if (dirty) + { + deltaPrev[offset] = 0.0; + } +} + +kernel void vqSeqBatchDerWeightsHalf( + const device half * outsPrev, + const device half * weights, + const device int * indices, + constant uint * pNbNeurons, + constant uint * pK, + constant float * pCoeff, + constant uint * pNbBatch, + constant uint * pSequence, + device half * grads, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint K; + float coeff; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pK && pCoeff && pNbBatch && pSequence && + outsPrev && weights && indices && grads) + { + nbNeurons = *pNbNeurons; + K = *pK; + coeff = *pCoeff; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint k = id[1]; + uint depth = id[0]; + + if (depth >= nbNeurons || k >= K) + { + return ; + } + + float sum = 0.0; + for (uint elem=0; elem= nbNeurons || elem * k >= nbBatch * K) + { + return ; + } + + float sum = 0.0; + for (uint seq=0; seq= nbBatch) + { + return ; + } + + float tmp = 0.0; + for (uint depth=0; depth= 0) + { + uint offset = + depth + nbNeurons * seq + sequence * nbNeurons * elem; + + float outPrev = outsPrev[offset]; + float vq = outs[offset]; + float diff = outPrev - vq; + + tmp += diff * diff; + } + }} + losses[elem] = tmp; +} + +kernel void vqLayerCAMMaxSeqHalf( + const device half * camLayer, + constant uint * pNbNeurons, + constant uint * pNbThreadgroups, + constant uint * pNbBatch, + constant uint * pSequence, + device half * camMax, + uint2 groupId [[ threadgroup_position_in_grid ]], + uint2 threadId [[ thread_position_in_threadgroup ]], + uint2 id [[ thread_position_in_grid ]]) +{ + constexpr uint threadsPerThreadgroup = 64; + threadgroup float camShared[threadsPerThreadgroup]; + + uint nbNeurons; + uint 
nbThreadgroups; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pNbThreadgroups && pNbBatch && pSequence && + camLayer && camMax) + { + nbNeurons = *pNbNeurons; + nbThreadgroups = *pNbThreadgroups; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint elem = id[1]; + uint seq = id[0]; + + if (seq >= sequence || elem >= nbBatch) + { + return ; + } + + camShared[threadId[0]] = camLayer[seq + sequence * elem]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint stride=threadsPerThreadgroup/2; stride>0; stride>>=1) + { + uint index = threadId[0] + groupId[0] * threadsPerThreadgroup; + if (threadId[0] < stride && + (index + stride) < sequence) + { + camShared[threadId[0]] = max( + camShared[threadId[0] + stride], + camShared[threadId[0]] + ); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (threadId[0] == 0) + { + uint offset = elem * nbThreadgroups + groupId[0]; + camMax[offset] = camShared[0]; + } +} + +kernel void vqGradSeqForwardHalf( + const device half * outsPrev, + const device half * camLayer, + const device half * camMax, + const device half * weights, + constant uint * pNbNeurons, + constant uint * pK, + constant float * pMagnitudeCoeff, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + device int * indices, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint K; + float magnitudeCoeff; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pK && pMagnitudeCoeff && pNbBatch && pSequence && + outsPrev && camLayer && camMax && weights && outs && indices) + { + nbNeurons = *pNbNeurons; + K = *pK; + magnitudeCoeff = *pMagnitudeCoeff; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint elem = id[1]; + uint seq = id[0]; + + if (seq >= sequence || elem >= nbBatch) + { + return ; + } + + float cam = camLayer[seq + sequence * elem]; + if (cam / camMax[elem] >= magnitudeCoeff) + { + int minIndex = -1; + float minValue = 0.0; + for (uint k=0; k= 0) + { + for (uint depth=0; depth? = nil + /// Float16 buffer. + var _float16: MetalBuffer? = nil + + /// Get Metal buffer. + public var metal: MTLBuffer + { + get { + if GrAI.Precision.float16 + { + if _float16 == nil + { + if shared + { + _float16 = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + _float16 = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + } + } + return _float16!.metal + } + else + { + if _float == nil + { + if shared + { + _float = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + _float = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + } + } + return _float!.metal + } + } + } + + /// + /// Create a wrapper of Metal buffer. + /// + /// - Parameters: + /// - nbElems: The number of elements in the array. + /// - deviceID: GPU ID where the array will be sent. + /// - shared: Whether to create a shared buffer or a private one. + /// + public init(nbElems: Int, deviceID: Int, shared: Bool = false) + { + self.deviceID = deviceID + self.nbElems = nbElems + self.shared = shared + } + + /// Clean the buffers. + func reset() + { + _float = nil + _float16 = nil + } + + /// Initialize Metal buffer. 
+ public func initialize() + { + if GrAI.Precision.float16 + { + if _float16 == nil + { + if shared + { + _float16 = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + let buffer = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + _float16 = buffer + _ = buffer.shared + } + } + _float16!.upload() + } + else + { + if _float == nil + { + if shared + { + _float = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + let buffer = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + _float = buffer + _ = buffer.shared + } + } + _float!.upload() + } + } + + /// + /// Initialize Metal buffer. + /// + /// - Parameters: + /// - array: Input array. + /// - start: Start offset. + /// + public func initialize( + array: inout [Float], + start: Int = 0) + { + if GrAI.Precision.float16 + { + if _float16 == nil + { + if shared + { + _float16 = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + _float16 = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + } + } + setupHalfBuffer( + array: &array, + out: _float16!, + start: start, + nbElems: nbElems, + deviceID: deviceID + ) + } + else + { + if _float == nil + { + if shared + { + _float = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + _float = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + } + } + setupFloatBuffer( + array: &array, + out: _float!, + start: start, + nbElems: nbElems, + deviceID: deviceID + ) + } + } + + /// Retrieve Metal buffer content. + public func download() -> [Float] + { + if GrAI.Precision.float16 + { + if _float16 == nil + { + if shared + { + _float16 = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + _float16 = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + } + } + return getHalfBuffer(_float16!).array + } + else + { + if _float == nil + { + if shared + { + _float = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + _float = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + } + } + return [Float](_float!.download()) + } + } +} + /// Abstract array of elements that can be sent to the GPU. 
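// Editor's sketch (not part of the original patch): FloatBuffer above hides
// the float/float16 split behind a Float-only host API; which backing
// MetalBuffer gets materialized is decided lazily by the global precision
// flag. A hypothetical round trip — deviceID, sizes, and the assumption
// that GrAI.Precision.float16 is settable are the editor's:
func sketchFloatBufferRoundTrip()
{
    GrAI.Precision.float16 = true              // route through half storage
    let buffer = FloatBuffer(nbElems: 1024, deviceID: 0)

    var weights = [Float](repeating: 0.5, count: 1024)
    buffer.initialize(array: &weights)         // uploads, converting to half

    let mtlBuffer = buffer.metal               // MTLBuffer to bind to kernels
    _ = mtlBuffer

    let back = buffer.download()               // reads back as [Float]
    assert(abs(back[0] - 0.5) < 1e-3)          // 0.5 is exact in half
}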
public class MetalBuffer { diff --git a/Sources/GrAIdient/Metal/MetalConfig.swift b/Sources/GrAIdient/Metal/MetalConfig.swift index 8776d4d4..5e76ccce 100644 --- a/Sources/GrAIdient/Metal/MetalConfig.swift +++ b/Sources/GrAIdient/Metal/MetalConfig.swift @@ -7,275 +7,548 @@ let CONFIG_KERNELS = [ - "Activation": [ - "forwardReLU", - "backwardReLU", - "forwardLeakyReLU", - "backwardLeakyReLU", - "forwardSoftReLU", - "backwardSoftReLU", - "forwardSigmoid", - "backwardSigmoid", - "forwardGELUApprox", - "backwardGELUApprox", - "forwardGELU", - "backwardGELU", - ], - "Biases": [ - "reduceBiases", - ], - "BatchNorm": [ - "computeBNConvμ", - "computeBNConvσ2", - "forwardBNConvTraining", - "forwardBNConvInference", - "backwardWeightsBNConv", - "backwardBNConvTraining", - "backwardBNConvInference", - ], - "Convolution": [ - "convForward", - "conv16Forward", - "convBackward", - "conv16Backward", - "convBatchDerWeights", - "conv34BatchDerWeights", - "convBatchDerBiases", - "convDerWeights", - "convDerBiases", - "convReduceWeights", - ], - "Deconvolution": [ - "deconvForward", - "deconvBackward", - "deconvBatchDerWeights", - "deconvDerWeights", - ], - "FullyConnected": [ - "flForward", - "flBackward", - "flBatchDerWeights", - "flBatchDerBiases", - "flDerWeights", - "flDerBiases", - "flReduceWeights", - ], - "FullyConnectedPatch": [ - "flPatchForward", - "flPatchBackward", - "flPatchBatchDerWeights", - "flPatchBatchDerBiases", - "flPatchBatch4DerBiases", - "flPatchDerWeights", - "flPatchDerBiases", - "flPatchReduceWeights", - ], - "FullyConnectedSeq": [ - "flSeqForward", - "flSeq48Forward", - "flSeq4Forward", - "flSeqBackward", - "flSeq48Backward", - "flSeq4Backward", - "flSeqBatchDerWeights", - "flSeqBatch4DerWeights", - "flSeqDerWeights", - "flSeqReduceWeights", - ], - "InstanceNorm": [ - "computeInstanceNormConvμ", - "computeInstanceNormConvσ2", - "forwardInstanceNormConv", - "forwardAdaIN", - "backwardWeightsInstanceNormConv", - "backward2AdaIN", - "backwardInstanceNormConv", - "backward1AdaIN", - ], - "Layer1D": [ - "MSE1DLoss", - "MSE1DLossDerivative", - "linearErrorLoss", - "linearErrorLossDerivative", - "selectNeurons1DForward", - "selectNeurons1DBackward", - "concat1DForward", - "concat1DBackward", - "softmax1DForward", - "softmax1DBackward", - "dotProduct1DForward", - "dotProduct1DBackward", - "constant1DForward", - "BCE1DLoss", - "BCE1DLossDerivative", - "BCESigmoid1DLoss", - "BCESigmoid1DLossDerivative", - "dropout1DForward", - "dropout1DBackward", - ], - "Layer2D": [ - "avgPoolForward", - "avgPoolBackward", - "maxPoolForward", - "maxPoolBackward", - "adaptiveAvgPoolForward1", - "adaptiveAvgPoolForward2", - "adaptiveAvgPoolBackward1", - "adaptiveAvgPoolBackward2", - "selectNeurons2DForward", - "selectNeurons2DBackward", - "IRDFT2RGBForward", - "IRDFT2RGBBackward", - "decorrelateRGBForward", - "decorrelateRGBBackward", - "linearScale2DForward", - "linearScale2DBackward", - "setDataFTFrequences2D", - "pad2DForward", - "pad2DBackward", - "crop2DForward", - "crop2DBackward", - "resizeBilinearPadForward", - "resizeBilinearPadBackward", - "rotate2DForward", - "rotate2DBackward", - "resizeBilinearCropForward", - "resizeBilinearCropBackward", - "concat02DForward", - "concat02DBackward", - "concat12DForward", - "concat12DBackward", - "constant2DForward", - "MSE2DLoss", - "MSE2DLossDerivative", - "selfCorrelate2DForward", - "selfCorrelate2DBackward", - "normalize12DForward", - "normalize12DBackward", - "computeSquaredNorm122D", - "normalize122DForward", - "computeDeltaTmp122D", - 
"normalize122DBackward", - "similarBatchError2DLoss", - "similarBatchError2DLossDerivative", - "similarError2DLossDerivative", - "flipHorizontal2DForward", - "flipHorizontal2DBackward", - "flipVertical2DForward", - "flipVertical2DBackward", - "colorJitterHSVForward", - "BCE2DLoss", - "BCE2DLossDerivative", - "BCESigmoid2DLoss", - "BCESigmoid2DLossDerivative", - "layerCAM2DForward", - ], - "LayerMerge": [ - "sum1", - "sum14", - "sum2", - "sum24", - "multiplyForward", - "multiplyBackward", - ], - "LayerNorm": [ - "computeLayerNormSeqμ", - "computeLayerNormSeqμ4", - "computeLayerNormSeqσ2", - "computeLayerNormSeqσ24", - "forwardLayerNormSeq", - "forwardLayerNormSeq4", - "backwardWeights1LayerNormSeq", - "backwardWeights1LayerNormSeq4", - "backwardWeights2LayerNormSeq", - "backwardWeights2LayerNormSeq4", - "backwardLayerNormSeq", - "backwardLayerNormSeq4", - ], - "LayerSeq": [ - "avgPoolSeqForward", - "avgPoolSeqBackward", - "concat1SeqForward", - "concat1Seq4Forward", - "concat1SeqBackward", - "concat1Seq4Backward", - "concat2SeqForward", - "concat2SeqBackward", - "constant12SeqForward", - "constant12Seq4Forward", - "constant12SeqBackward", - "constant12Seq4Backward", - "constant2SeqForward", - "constant2Seq4Forward", - "querySeqForward", - "querySeq4Forward", - "queryQuerySeqBackward", - "queryQuerySeq4Backward", - "queryKeySeqBackward", - "queryKeySeq4Backward", - "querySelfSeqForward", - "querySelfSeq4Forward", - "querySelfQuerySeqBackward", - "querySelfQuerySeq4Backward", - "querySelfKeySeqBackward", - "querySelfKeySeq4Backward", - "softmaxSeqForward", - "softmaxSeq4Forward", - "softmaxSeqBackward", - "softmaxSeq4Backward", - "valueSeqForward", - "valueSeq4Forward", - "valueValueSeqBackward", - "valueValueSeq4Backward", - "valueScoreSeqBackward", - "valueScoreSeq4Backward", - "valueSelfSeqForward", - "valueSelfSeq4Forward", - "valueSelfValueSeqBackward", - "valueSelfValueSeq4Backward", - "valueSelfScoreSeqBackward", - "valueSelfScoreSeq4Backward", - "selectSeqForward", - "selectSeqBackward", - "layerCAMSeqForward", - ], - "Optimizer": [ - "clipGradients", - "multiplyGradients", - "weightsSGD", - "weightsMomentum", - "weightsAdam", - "weightsAMSGrad", - "weightsAdamRectified", - "weightsAdaBound", - "weightsAMSBound", - ], - "Reduce": [ - "reduceSum64", - "reduceSum", - "reduceMax64", - "reduceMax", - ], - "Reset": [ - "reset" - ], - "VQ2D": [ - "vq2DForward", - "vq2DBackward", - "vq2DBatchDerWeights", - "vq2DDerWeights", - "vq2DReduceWeights", - "vq2DLoss", - "vqLayerCAMMax2D", - "vqGrad2DForward" - ], - "VQSeq": [ - "vqSeqForward", - "vqSeqBackward", - "vqSeqBatchDerWeights", - "vqSeqDerWeights", - "vqSeqLoss", - "vqLayerCAMMaxSeq", - "vqGradSeqForward" - ] + "ActivationFloat": [ + "forwardReLUFloat", + "backwardReLUFloat", + "forwardLeakyReLUFloat", + "backwardLeakyReLUFloat", + "forwardSoftReLUFloat", + "backwardSoftReLUFloat", + "forwardSigmoidFloat", + "backwardSigmoidFloat", + "forwardGELUApproxFloat", + "backwardGELUApproxFloat", + "forwardGELUFloat", + "backwardGELUFloat", + ], + "ActivationHalf": [ + "forwardReLUHalf", + "backwardReLUHalf", + "forwardLeakyReLUHalf", + "backwardLeakyReLUHalf", + "forwardSoftReLUHalf", + "backwardSoftReLUHalf", + "forwardSigmoidHalf", + "backwardSigmoidHalf", + "forwardGELUApproxHalf", + "backwardGELUApproxHalf", + "forwardGELUHalf", + "backwardGELUHalf", + ], + "BiasesFloat": [ + "reduceBiasesFloat", + ], + "BiasesHalf": [ + "reduceBiasesHalf", + ], + "BatchNormFloat": [ + "computeBNConvμFloat", + "computeBNConvσ2Float", + 
"forwardBNConvTrainingFloat", + "forwardBNConvInferenceFloat", + "backwardWeightsBNConvFloat", + "backwardBNConvTrainingFloat", + "backwardBNConvInferenceFloat", + ], + "BatchNormHalf": [ + "computeBNConvμHalf", + "computeBNConvσ2Half", + "forwardBNConvTrainingHalf", + "forwardBNConvInferenceHalf", + "backwardWeightsBNConvHalf", + "backwardBNConvTrainingHalf", + "backwardBNConvInferenceHalf", + ], + "ConvolutionFloat": [ + "convForwardFloat", + "conv16ForwardFloat", + "convBackwardFloat", + "conv16BackwardFloat", + "convBatchDerWeightsFloat", + "conv34BatchDerWeightsFloat", + "convBatchDerBiasesFloat", + "convDerWeightsFloat", + "convDerBiasesFloat", + "convReduceWeightsFloat", + ], + "ConvolutionHalf": [ + "convForwardHalf", + "conv16ForwardHalf", + "convBackwardHalf", + "conv16BackwardHalf", + "convBatchDerWeightsHalf", + "conv34BatchDerWeightsHalf", + "convBatchDerBiasesHalf", + "convDerWeightsHalf", + "convDerBiasesHalf", + "convReduceWeightsHalf", + ], + "DeconvolutionFloat": [ + "deconvForwardFloat", + "deconvBackwardFloat", + "deconvBatchDerWeightsFloat", + "deconvDerWeightsFloat", + ], + "DeconvolutionHalf": [ + "deconvForwardHalf", + "deconvBackwardHalf", + "deconvBatchDerWeightsHalf", + "deconvDerWeightsHalf", + ], + "FullyConnectedFloat": [ + "flForwardFloat", + "flBackwardFloat", + "flBatchDerWeightsFloat", + "flBatchDerBiasesFloat", + "flDerWeightsFloat", + "flDerBiasesFloat", + "flReduceWeightsFloat", + ], + "FullyConnectedHalf": [ + "flForwardHalf", + "flBackwardHalf", + "flBatchDerWeightsHalf", + "flBatchDerBiasesHalf", + "flDerWeightsHalf", + "flDerBiasesHalf", + "flReduceWeightsHalf", + ], + "FullyConnectedPatchFloat": [ + "flPatchForwardFloat", + "flPatchBackwardFloat", + "flPatchBatchDerWeightsFloat", + "flPatchBatchDerBiasesFloat", + "flPatchBatch4DerBiasesFloat", + "flPatchDerWeightsFloat", + "flPatchDerBiasesFloat", + "flPatchReduceWeightsFloat", + ], + "FullyConnectedPatchHalf": [ + "flPatchForwardHalf", + "flPatchBackwardHalf", + "flPatchBatchDerWeightsHalf", + "flPatchBatchDerBiasesHalf", + "flPatchBatch4DerBiasesHalf", + "flPatchDerWeightsHalf", + "flPatchDerBiasesHalf", + "flPatchReduceWeightsHalf", + ], + "FullyConnectedSeqFloat": [ + "flSeqForwardFloat", + "flSeq48ForwardFloat", + "flSeq4ForwardFloat", + "flSeqBackwardFloat", + "flSeq48BackwardFloat", + "flSeq4BackwardFloat", + "flSeqBatchDerWeightsFloat", + "flSeqBatch4DerWeightsFloat", + "flSeqDerWeightsFloat", + "flSeqReduceWeightsFloat", + ], + "FullyConnectedSeqHalf": [ + "flSeqForwardHalf", + "flSeq48ForwardHalf", + "flSeq4ForwardHalf", + "flSeqBackwardHalf", + "flSeq48BackwardHalf", + "flSeq4BackwardHalf", + "flSeqBatchDerWeightsHalf", + "flSeqBatch4DerWeightsHalf", + "flSeqDerWeightsHalf", + "flSeqReduceWeightsHalf", + ], + "InstanceNormFloat": [ + "computeInstanceNormConvμFloat", + "computeInstanceNormConvσ2Float", + "forwardInstanceNormConvFloat", + "forwardAdaINFloat", + "backwardWeightsInstanceNormConvFloat", + "backward2AdaINFloat", + "backwardInstanceNormConvFloat", + "backward1AdaINFloat", + ], + "InstanceNormHalf": [ + "computeInstanceNormConvμHalf", + "computeInstanceNormConvσ2Half", + "forwardInstanceNormConvHalf", + "forwardAdaINHalf", + "backwardWeightsInstanceNormConvHalf", + "backward2AdaINHalf", + "backwardInstanceNormConvHalf", + "backward1AdaINHalf", + ], + "Layer1DFloat": [ + "MSE1DLossFloat", + "MSE1DLossDerivativeFloat", + "linearErrorLossFloat", + "linearErrorLossDerivativeFloat", + "selectNeurons1DForwardFloat", + "selectNeurons1DBackwardFloat", + "concat1DForwardFloat", + 
"concat1DBackwardFloat", + "softmax1DForwardFloat", + "softmax1DBackwardFloat", + "dotProduct1DForwardFloat", + "dotProduct1DBackwardFloat", + "constant1DForwardFloat", + "BCE1DLossFloat", + "BCE1DLossDerivativeFloat", + "BCESigmoid1DLossFloat", + "BCESigmoid1DLossDerivativeFloat", + "dropout1DForwardFloat", + "dropout1DBackwardFloat", + ], + "Layer1DHalf": [ + "MSE1DLossHalf", + "MSE1DLossDerivativeHalf", + "linearErrorLossHalf", + "linearErrorLossDerivativeHalf", + "selectNeurons1DForwardHalf", + "selectNeurons1DBackwardHalf", + "concat1DForwardHalf", + "concat1DBackwardHalf", + "softmax1DForwardHalf", + "softmax1DBackwardHalf", + "dotProduct1DForwardHalf", + "dotProduct1DBackwardHalf", + "constant1DForwardHalf", + "BCE1DLossHalf", + "BCE1DLossDerivativeHalf", + "BCESigmoid1DLossHalf", + "BCESigmoid1DLossDerivativeHalf", + "dropout1DForwardHalf", + "dropout1DBackwardHalf", + ], + "Layer2DFloat": [ + "avgPoolForwardFloat", + "avgPoolBackwardFloat", + "maxPoolForwardFloat", + "maxPoolBackwardFloat", + "adaptiveAvgPoolForward1Float", + "adaptiveAvgPoolForward2Float", + "adaptiveAvgPoolBackward1Float", + "adaptiveAvgPoolBackward2Float", + "selectNeurons2DForwardFloat", + "selectNeurons2DBackwardFloat", + "IRDFT2RGBForwardFloat", + "IRDFT2RGBBackwardFloat", + "decorrelateRGBForwardFloat", + "decorrelateRGBBackwardFloat", + "linearScale2DForwardFloat", + "linearScale2DBackwardFloat", + "setDataFTFrequences2DFloat", + "pad2DForwardFloat", + "pad2DBackwardFloat", + "crop2DForwardFloat", + "crop2DBackwardFloat", + "resizeBilinearPadForwardFloat", + "resizeBilinearPadBackwardFloat", + "rotate2DForwardFloat", + "rotate2DBackwardFloat", + "resizeBilinearCropForwardFloat", + "resizeBilinearCropBackwardFloat", + "concat02DForwardFloat", + "concat02DBackwardFloat", + "concat12DForwardFloat", + "concat12DBackwardFloat", + "constant2DForwardFloat", + "MSE2DLossFloat", + "MSE2DLossDerivativeFloat", + "selfCorrelate2DForwardFloat", + "selfCorrelate2DBackwardFloat", + "normalize12DForwardFloat", + "normalize12DBackwardFloat", + "computeSquaredNorm122DFloat", + "normalize122DForwardFloat", + "computeDeltaTmp122DFloat", + "normalize122DBackwardFloat", + "similarBatchError2DLossFloat", + "similarBatchError2DLossDerivativeFloat", + "similarError2DLossDerivativeFloat", + "flipHorizontal2DForwardFloat", + "flipHorizontal2DBackwardFloat", + "flipVertical2DForwardFloat", + "flipVertical2DBackwardFloat", + "colorJitterHSVForwardFloat", + "BCE2DLossFloat", + "BCE2DLossDerivativeFloat", + "BCESigmoid2DLossFloat", + "BCESigmoid2DLossDerivativeFloat", + "layerCAM2DForwardFloat", + ], + "Layer2DHalf": [ + "avgPoolForwardHalf", + "avgPoolBackwardHalf", + "maxPoolForwardHalf", + "maxPoolBackwardHalf", + "adaptiveAvgPoolForward1Half", + "adaptiveAvgPoolForward2Half", + "adaptiveAvgPoolBackward1Half", + "adaptiveAvgPoolBackward2Half", + "selectNeurons2DForwardHalf", + "selectNeurons2DBackwardHalf", + "IRDFT2RGBForwardHalf", + "IRDFT2RGBBackwardHalf", + "decorrelateRGBForwardHalf", + "decorrelateRGBBackwardHalf", + "linearScale2DForwardHalf", + "linearScale2DBackwardHalf", + "setDataFTFrequences2DHalf", + "pad2DForwardHalf", + "pad2DBackwardHalf", + "crop2DForwardHalf", + "crop2DBackwardHalf", + "resizeBilinearPadForwardHalf", + "resizeBilinearPadBackwardHalf", + "rotate2DForwardHalf", + "rotate2DBackwardHalf", + "resizeBilinearCropForwardHalf", + "resizeBilinearCropBackwardHalf", + "concat02DForwardHalf", + "concat02DBackwardHalf", + "concat12DForwardHalf", + "concat12DBackwardHalf", + "constant2DForwardHalf", + 
"MSE2DLossHalf", + "MSE2DLossDerivativeHalf", + "selfCorrelate2DForwardHalf", + "selfCorrelate2DBackwardHalf", + "normalize12DForwardHalf", + "normalize12DBackwardHalf", + "computeSquaredNorm122DHalf", + "normalize122DForwardHalf", + "computeDeltaTmp122DHalf", + "normalize122DBackwardHalf", + "similarBatchError2DLossHalf", + "similarBatchError2DLossDerivativeHalf", + "similarError2DLossDerivativeHalf", + "flipHorizontal2DForwardHalf", + "flipHorizontal2DBackwardHalf", + "flipVertical2DForwardHalf", + "flipVertical2DBackwardHalf", + "colorJitterHSVForwardHalf", + "BCE2DLossHalf", + "BCE2DLossDerivativeHalf", + "BCESigmoid2DLossHalf", + "BCESigmoid2DLossDerivativeHalf", + "layerCAM2DForwardHalf", + ], + "LayerMergeFloat": [ + "sum1Float", + "sum14Float", + "sum2Float", + "sum24Float", + "multiplyForwardFloat", + "multiplyBackwardFloat", + ], + "LayerMergeHalf": [ + "sum1Half", + "sum14Half", + "sum2Half", + "sum24Half", + "multiplyForwardHalf", + "multiplyBackwardHalf", + ], + "LayerNormFloat": [ + "computeLayerNormSeqμFloat", + "computeLayerNormSeqμ4Float", + "computeLayerNormSeqσ2Float", + "computeLayerNormSeqσ24Float", + "forwardLayerNormSeqFloat", + "forwardLayerNormSeq4Float", + "backwardWeights1LayerNormSeqFloat", + "backwardWeights1LayerNormSeq4Float", + "backwardWeights2LayerNormSeqFloat", + "backwardWeights2LayerNormSeq4Float", + "backwardLayerNormSeqFloat", + "backwardLayerNormSeq4Float", + ], + "LayerNormHalf": [ + "computeLayerNormSeqμHalf", + "computeLayerNormSeqμ4Half", + "computeLayerNormSeqσ2Half", + "computeLayerNormSeqσ24Half", + "forwardLayerNormSeqHalf", + "forwardLayerNormSeq4Half", + "backwardWeights1LayerNormSeqHalf", + "backwardWeights1LayerNormSeq4Half", + "backwardWeights2LayerNormSeqHalf", + "backwardWeights2LayerNormSeq4Half", + "backwardLayerNormSeqHalf", + "backwardLayerNormSeq4Half", + ], + "LayerSeqFloat": [ + "avgPoolSeqForwardFloat", + "avgPoolSeqBackwardFloat", + "concat1SeqForwardFloat", + "concat1Seq4ForwardFloat", + "concat1SeqBackwardFloat", + "concat1Seq4BackwardFloat", + "concat2SeqForwardFloat", + "concat2SeqBackwardFloat", + "constant12SeqForwardFloat", + "constant12Seq4ForwardFloat", + "constant12SeqBackwardFloat", + "constant12Seq4BackwardFloat", + "constant2SeqForwardFloat", + "constant2Seq4ForwardFloat", + "querySeqForwardFloat", + "querySeq4ForwardFloat", + "queryQuerySeqBackwardFloat", + "queryQuerySeq4BackwardFloat", + "queryKeySeqBackwardFloat", + "queryKeySeq4BackwardFloat", + "querySelfSeqForwardFloat", + "querySelfSeq4ForwardFloat", + "querySelfQuerySeqBackwardFloat", + "querySelfQuerySeq4BackwardFloat", + "querySelfKeySeqBackwardFloat", + "querySelfKeySeq4BackwardFloat", + "softmaxSeqForwardFloat", + "softmaxSeq4ForwardFloat", + "softmaxSeqBackwardFloat", + "softmaxSeq4BackwardFloat", + "valueSeqForwardFloat", + "valueSeq4ForwardFloat", + "valueValueSeqBackwardFloat", + "valueValueSeq4BackwardFloat", + "valueScoreSeqBackwardFloat", + "valueScoreSeq4BackwardFloat", + "valueSelfSeqForwardFloat", + "valueSelfSeq4ForwardFloat", + "valueSelfValueSeqBackwardFloat", + "valueSelfValueSeq4BackwardFloat", + "valueSelfScoreSeqBackwardFloat", + "valueSelfScoreSeq4BackwardFloat", + "selectSeqForwardFloat", + "selectSeqBackwardFloat", + "layerCAMSeqForwardFloat", + ], + "LayerSeqHalf": [ + "avgPoolSeqForwardHalf", + "avgPoolSeqBackwardHalf", + "concat1SeqForwardHalf", + "concat1Seq4ForwardHalf", + "concat1SeqBackwardHalf", + "concat1Seq4BackwardHalf", + "concat2SeqForwardHalf", + "concat2SeqBackwardHalf", + "constant12SeqForwardHalf", + 
"constant12Seq4ForwardHalf", + "constant12SeqBackwardHalf", + "constant12Seq4BackwardHalf", + "constant2SeqForwardHalf", + "constant2Seq4ForwardHalf", + "querySeqForwardHalf", + "querySeq4ForwardHalf", + "queryQuerySeqBackwardHalf", + "queryQuerySeq4BackwardHalf", + "queryKeySeqBackwardHalf", + "queryKeySeq4BackwardHalf", + "querySelfSeqForwardHalf", + "querySelfSeq4ForwardHalf", + "querySelfQuerySeqBackwardHalf", + "querySelfQuerySeq4BackwardHalf", + "querySelfKeySeqBackwardHalf", + "querySelfKeySeq4BackwardHalf", + "softmaxSeqForwardHalf", + "softmaxSeq4ForwardHalf", + "softmaxSeqBackwardHalf", + "softmaxSeq4BackwardHalf", + "valueSeqForwardHalf", + "valueSeq4ForwardHalf", + "valueValueSeqBackwardHalf", + "valueValueSeq4BackwardHalf", + "valueScoreSeqBackwardHalf", + "valueScoreSeq4BackwardHalf", + "valueSelfSeqForwardHalf", + "valueSelfSeq4ForwardHalf", + "valueSelfValueSeqBackwardHalf", + "valueSelfValueSeq4BackwardHalf", + "valueSelfScoreSeqBackwardHalf", + "valueSelfScoreSeq4BackwardHalf", + "selectSeqForwardHalf", + "selectSeqBackwardHalf", + "layerCAMSeqForwardHalf", + ], + "OptimizerFloat": [ + "clipGradientsFloat", + "multiplyGradientsFloat", + "weightsSGDFloat", + "weightsMomentumFloat", + "weightsAdamFloat", + "weightsAMSGradFloat", + "weightsAdamRectifiedFloat", + "weightsAdaBoundFloat", + "weightsAMSBoundFloat", + ], + "OptimizerHalf": [ + "clipGradientsHalf", + "multiplyGradientsHalf", + "weightsSGDHalf", + "weightsMomentumHalf", + "weightsAdamHalf", + "weightsAMSGradHalf", + "weightsAdamRectifiedHalf", + "weightsAdaBoundHalf", + "weightsAMSBoundHalf", + ], + "ReduceFloat": [ + "reduceSum64Float", + "reduceSumFloat", + "reduceMax64Float", + "reduceMaxFloat", + ], + "ReduceHalf": [ + "reduceSum64Half", + "reduceSumHalf", + "reduceMax64Half", + "reduceMaxHalf", + ], + "ResetFloat": [ + "resetFloat", + ], + "ResetHalf": [ + "resetHalf", + "convertFloat2Half", + "convertHalf2Float", + ], + "VQ2DFloat": [ + "vq2DForwardFloat", + "vq2DBackwardFloat", + "vq2DBatchDerWeightsFloat", + "vq2DDerWeightsFloat", + "vq2DReduceWeightsFloat", + "vq2DLossFloat", + "vqLayerCAMMax2DFloat", + "vqGrad2DForwardFloat", + ], + "VQ2DHalf": [ + "vq2DForwardHalf", + "vq2DBackwardHalf", + "vq2DBatchDerWeightsHalf", + "vq2DDerWeightsHalf", + "vq2DReduceWeightsHalf", + "vq2DLossHalf", + "vqLayerCAMMax2DHalf", + "vqGrad2DForwardHalf", + ], + "VQSeqFloat": [ + "vqSeqForwardFloat", + "vqSeqBackwardFloat", + "vqSeqBatchDerWeightsFloat", + "vqSeqDerWeightsFloat", + "vqSeqLossFloat", + "vqLayerCAMMaxSeqFloat", + "vqGradSeqForwardFloat", + ], + "VQSeqHalf": [ + "vqSeqForwardHalf", + "vqSeqBackwardHalf", + "vqSeqBatchDerWeightsHalf", + "vqSeqDerWeightsHalf", + "vqSeqLossHalf", + "vqLayerCAMMaxSeqHalf", + "vqGradSeqForwardHalf", + ], ] diff --git a/Sources/GrAIdient/Metal/MetalKernel.swift b/Sources/GrAIdient/Metal/MetalKernel.swift index f3ebd173..d3a834af 100644 --- a/Sources/GrAIdient/Metal/MetalKernel.swift +++ b/Sources/GrAIdient/Metal/MetalKernel.swift @@ -704,11 +704,31 @@ private class MetalDevice /// func createCommand(_ pipeline: String) -> MetalCommand { - if let pipelineTmp = _pipelines[pipeline] + var pipelineFullName = pipeline + if GrAI.Precision.float16 + { + pipelineFullName += "Half" + } + else + { + pipelineFullName += "Float" + } + + if let pipelineTmp = _pipelines[pipelineFullName] { return MetalCommand(queue: _queue, pipeline: pipelineTmp) } - fatalError("Could not find pipeline: \(pipeline).") + else if let pipelineTmp = _pipelines[pipeline] + { + return MetalCommand(queue: _queue, pipeline: 
pipelineTmp) } + else + { + fatalError( + "Could not find pipeline: " + + "\(pipelineFullName), nor \(pipeline)." + ) + } } ///
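The renamed kernels in MetalConfig.swift above are what make this lookup work: callers keep requesting base names such as "reduceSum", and the device appends the precision suffix before probing the pipeline cache. A minimal standalone sketch of that resolution order (the resolvePipeline function and the pipelines set are illustrative, not part of the library):

    // Sketch of the lookup order implemented by createCommand above;
    // `pipelines` stands in for the private _pipelines dictionary.
    func resolvePipeline(
        _ name: String, float16: Bool, pipelines: Set<String>) -> String?
    {
        let fullName = name + (float16 ? "Half" : "Float")
        if pipelines.contains(fullName)
        {
            // Built-in kernel with a precision-specific variant.
            return fullName
        }
        if pipelines.contains(name)
        {
            // Fallback for kernels registered under their exact name.
            return name
        }
        // Mirrors the fatalError in createCommand.
        return nil
    }

    // With the config above:
    // resolvePipeline("reduceSum", float16: false, pipelines: all) == "reduceSumFloat"
    // resolvePipeline("reduceSum", float16: true, pipelines: all) == "reduceSumHalf"

The fallback branch keeps custom kernels that were registered without a suffix working unchanged.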
diff --git a/Sources/GrAIdient/Utils/Buffer.swift b/Sources/GrAIdient/Utils/Buffer.swift index 37489c4d..05b2e6dd 100644 --- a/Sources/GrAIdient/Utils/Buffer.swift +++ b/Sources/GrAIdient/Utils/Buffer.swift @@ -9,46 +9,173 @@ import Foundation import Accelerate /// -/// Copy array to buffer. +/// Copy, convert and upload Float array to Half buffer. /// /// - Parameters: -/// - array: input array -/// - buffer: output buffer -/// - start: start index in `array` +/// - array: Input array. +/// - out: Output buffer. +/// - start: Start index in `array`. /// - nbElems: Number of elements to copy. +/// - deviceID: GPU device. /// -func copyFloatArrayToBuffer( +public func setupHalfBuffer( array: inout [Float], - buffer: UnsafeMutableBufferPointer<Float>, + out: MetalBuffer, start: Int, - nbElems: Int) + nbElems: Int, + deviceID: Int) +{ + let temp = MetalSharedBuffer<Float>(nbElems, deviceID: deviceID) + copyArrayToBuffer( + array: &array, + buffer: temp.buffer, + start: start, + nbElems: nbElems + ) + + temp.upload() + convertFloat2Half( + inBuffer: temp, + outBuffer: out, + nbElems: nbElems, + deviceID: deviceID + ) + + // Make sure the operation has ended before returning. + _ = out.download() +} + +/// +/// Copy and upload Float array to Float buffer. +/// +/// - Parameters: +/// - array: Input array. +/// - out: Output buffer. +/// - start: Start index in `array`. +/// - nbElems: Number of elements to copy. +/// - deviceID: GPU device. +/// +public func setupFloatBuffer( + array: inout [Float], + out: MetalBuffer<Float>, + start: Int, + nbElems: Int, + deviceID: Int) { - if #available(macOS 13.0, *) + if let out_s = out as? MetalSharedBuffer<Float> { copyArrayToBuffer( array: &array, - buffer: buffer, - start: start, + buffer: out_s.buffer, + start: start, nbElems: nbElems ) } else { - fatalError() + let out_p = out as! MetalPrivateBuffer<Float> + copyArrayToBuffer( + array: &array, + buffer: out_p.shared.buffer, + start: start, + nbElems: nbElems + ) } + out.upload() +} + +/// +/// Convert Half buffer to Float buffer and download content. +/// +/// - Parameter buffer: Input buffer. +/// +/// - Returns: Float buffer. +/// +public func getHalfBuffer( + _ buffer: MetalBuffer +) -> MetalSharedBuffer<Float> +{ + let temp = MetalSharedBuffer<Float>( + buffer.nbElems, + deviceID: buffer.deviceID + ) + convertHalf2Float( + inBuffer: buffer, + outBuffer: temp, + nbElems: buffer.nbElems, + deviceID: buffer.deviceID + ) + + _ = temp.download() + return temp +} + +/// +/// Convert a Float32 buffer into a Float16 buffer. +/// +/// - Parameters: +/// - inBuffer: Input buffer. +/// - outBuffer: Output buffer. +/// - nbElems: Number of elements. +/// - deviceID: GPU device. +/// +public func convertFloat2Half( + inBuffer: MetalBuffer<Float>, + outBuffer: MetalBuffer, + nbElems: Int, + deviceID: Int) +{ + let pNbElems: [UInt32] = [UInt32(nbElems)] + + let command = MetalKernel.get.createCommand( + "convertFloat2Half", deviceID: deviceID + ) + command.setBuffer(inBuffer.metal, atIndex: 0) + command.setBytes(pNbElems, atIndex: 1) + command.setBuffer(outBuffer.metal, atIndex: 2) + + command.dispatchThreads(nbElems) + command.enqueue() +} + +/// +/// Convert a Float16 buffer into a Float32 buffer. +/// +/// - Parameters: +/// - inBuffer: Input buffer. +/// - outBuffer: Output buffer. +/// - nbElems: Number of elements. +/// - deviceID: GPU device. +/// +public func convertHalf2Float( + inBuffer: MetalBuffer, + outBuffer: MetalBuffer<Float>, + nbElems: Int, + deviceID: Int) +{ + let pNbElems: [UInt32] = [UInt32(nbElems)] + + let command = MetalKernel.get.createCommand( + "convertHalf2Float", deviceID: deviceID + ) + command.setBuffer(inBuffer.metal, atIndex: 0) + command.setBytes(pNbElems, atIndex: 1) + command.setBuffer(outBuffer.metal, atIndex: 2) + + command.dispatchThreads(nbElems) + command.enqueue() } -@available(macOS 13.0, *) /// /// Copy array to buffer. /// /// - Parameters: -/// - array: input array -/// - buffer: output buffer -/// - start: start index in `array` +/// - array: Input array. +/// - buffer: Output buffer. +/// - start: Start index in `array`. /// - nbElems: Number of elements to copy. /// -func copyArrayToBuffer<T>( +public func copyArrayToBuffer<T>( array: inout [T], buffer: UnsafeMutableBufferPointer<T>, start: Int,
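Together, the helpers above form a round trip between [Float] arrays and GPU buffers in either precision. A usage sketch, assuming the half data lives in a MetalBuffer<UInt16> (the half element type is not visible in this hunk and is an assumption):

    import GrAIdient

    var values: [Float] = [0.5, 1.0, 1.5, 2.0]

    // Float32 path: copy into the buffer's storage, then upload.
    let floatBuf = MetalSharedBuffer<Float>(values.count, deviceID: 0)
    setupFloatBuffer(
        array: &values, out: floatBuf,
        start: 0, nbElems: values.count, deviceID: 0
    )

    // Float16 path: setupHalfBuffer stages the values in a temporary
    // Float buffer and runs the convertFloat2Half kernel.
    // MetalBuffer<UInt16> as the half storage type is an assumption.
    let halfBuf = MetalSharedBuffer<UInt16>(values.count, deviceID: 0)
    setupHalfBuffer(
        array: &values, out: halfBuf,
        start: 0, nbElems: values.count, deviceID: 0
    )

    // Read the half data back as Float32 on the CPU.
    let readable = getHalfBuffer(halfBuf)
    print(Array(readable.buffer)) // [0.5, 1.0, 1.5, 2.0]

Note that setupHalfBuffer ends with a blocking download purely as a synchronization point, so the staged conversion is guaranteed to have finished before the temporary buffer goes away.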
diff --git a/Sources/GrAIdient/Utils/Image.swift b/Sources/GrAIdient/Utils/Image.swift index 9c24c81d..bab6b6a6 100644 --- a/Sources/GrAIdient/Utils/Image.swift +++ b/Sources/GrAIdient/Utils/Image.swift @@ -44,14 +44,14 @@ public class Image /// the output buffer in the .Neuron format. /// /// - Parameters: - /// - metalBuffer: Buffer of images. + /// - imagesURL: Images on the disk. + /// - imagesBuffer: Buffer of images. /// - width: Width of the images. /// - height: Height of the images. - /// - Returns: The list of images as list of pixels. /// public static func loadImages( imagesURL: [URL], - imagesBuffer: MetalBuffer<Float>, + imagesBuffer: FloatBuffer, width: Int, height: Int) throws { @@ -61,7 +61,13 @@ throw ImageError.MissingSpace } - let bufferPtr = imagesBuffer.download() + _ = imagesBuffer.download() + + var buffer = [Float]( + repeating: 0.0, + count: batchSize * 3 * height * width + ) + for (elem, imageURL) in imagesURL.enumerated() { let image = NSImage(contentsOfFile: imageURL.path)! @@ -79,12 +85,12 @@ let offsetStart = (depth + 3 * elem) * height let offsetSet = j + (offsetStart + i) * width - bufferPtr[offsetSet] = + buffer[offsetSet] = Float(pixels[3 * offsetGet + depth]) / 255.0 } }} } - imagesBuffer.upload() + imagesBuffer.initialize(array: &buffer) } /// @@ -100,18 +106,18 @@ /// - Returns: The list of images as list of pixels. /// public static func extractPixels( - _ metalBuffer: MetalBuffer<Float>, + _ metalBuffer: FloatBuffer, width: Int, height: Int) -> [[UInt8]] { - let bufferPtr = metalBuffer.download() + let buffer = metalBuffer.download() let nbImages = metalBuffer.nbElems / (width * height * 3) var images = [[Float]]() for i in 0..<nbImages diff --git a/Tests/GrAIExamples/TransformerBenchmark.swift b/Tests/GrAIExamples/TransformerBenchmark.swift - let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0) - let gtBuffer = groundTruth.buffer + let groundTruth = FloatBuffer( + nbElems: _batchSize, deviceID: 0, shared: true + ) + var gtBuffer = [Float](repeating: 0.0, count: _batchSize) for elem in 0..<_batchSize / 2 { gtBuffer[elem] = 0.0 @@ -248,18 +252,20 @@ final class TransformerBenchmark: XCTestCase { gtBuffer[elem] = 1.0 } - groundTruth.upload() + groundTruth.initialize(array: &gtBuffer) // Initialize data once and for all. - let data = MetalPrivateBuffer<Float>( - _batchSize * 3 * _size * _size, deviceID: 0 + let data = FloatBuffer( + nbElems: _batchSize * 3 * _size * _size, deviceID: 0, shared: true + ) + var dataBuffer = [Float]( + repeating: 0.0, count: _batchSize * 3 * _size * _size ) - let dataBuffer = data.shared.buffer for i in 0..<_batchSize * 3 * _size * _size { dataBuffer[i] = Float.random(in: -1..<1) } - data.upload() + data.initialize(array: &dataBuffer) let nbEpochs = 2 let nbSteps = 20 @@ -349,8 +355,10 @@ final class TransformerBenchmark: XCTestCase let lastLayer: MSE1D = transformer.layers.last as! MSE1D // Initialize the ground truth once and for all. - let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0) - let gtBuffer = groundTruth.buffer + let groundTruth = FloatBuffer( + nbElems: _batchSize, deviceID: 0, shared: true + ) + var gtBuffer = [Float](repeating: 0.0, count: _batchSize) for elem in 0..<_batchSize / 2 { gtBuffer[elem] = 0.0 @@ -359,18 +367,20 @@ { gtBuffer[elem] = 1.0 } - groundTruth.upload() + groundTruth.initialize(array: &gtBuffer) // Initialize data once and for all. - let data = MetalPrivateBuffer<Float>( - _batchSize * 3 * _size * _size, deviceID: 0 + let data = FloatBuffer( + nbElems: _batchSize * 3 * _size * _size, deviceID: 0, shared: true + ) + var dataBuffer = [Float]( + repeating: 0.0, count: _batchSize * 3 * _size * _size ) - let dataBuffer = data.shared.buffer for i in 0..<_batchSize * 3 * _size * _size { dataBuffer[i] = Float.random(in: -1..<1) } - data.upload() + data.initialize(array: &dataBuffer) let nbEpochs = 2 let nbSteps = 20
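Every test migration in this patch repeats the pattern visible in the benchmark hunks above: direct writes through a MetalSharedBuffer pointer followed by an explicit upload() give way to filling a plain [Float] array and handing it to FloatBuffer.initialize(array:). A condensed before/after sketch (batchSize is illustrative):

    import GrAIdient

    let batchSize = 8

    // Before: write through the shared buffer's pointer, then upload.
    let gtOld = MetalSharedBuffer<Float>(batchSize, deviceID: 0)
    for elem in 0..<batchSize
    {
        gtOld.buffer[elem] = elem < batchSize / 2 ? 0.0 : 1.0
    }
    gtOld.upload()

    // After: fill a plain array and hand it to FloatBuffer, which hides
    // the storage mode and, when float16 is active, the GPU precision.
    let gtNew = FloatBuffer(nbElems: batchSize, deviceID: 0, shared: true)
    var gtBuffer = [Float](repeating: 0.0, count: batchSize)
    for elem in batchSize / 2..<batchSize
    {
        gtBuffer[elem] = 1.0
    }
    gtNew.initialize(array: &gtBuffer)

The same rewrite applies to the Transformer and VGG example tests below; only the buffer sizes differ.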
diff --git a/Tests/GrAIExamples/TransformerExample.swift b/Tests/GrAIExamples/TransformerExample.swift index 5d39e2be..bd2a08be 100644 --- a/Tests/GrAIExamples/TransformerExample.swift +++ b/Tests/GrAIExamples/TransformerExample.swift @@ -29,7 +29,9 @@ final class TransformerExample: XCTestCase { setPythonLib() _ = MetalKernel.get + GrAI.Opti.GPU = true + GrAI.Precision.float = true } /// @@ -287,17 +289,19 @@ final class TransformerExample: XCTestCase let lastLayer: MSE1D = transformer.layers.last as! MSE1D // Initialize the ground truth once and for all. - let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0) - let buffer = groundTruth.buffer + let groundTruth = FloatBuffer( + nbElems: _batchSize, deviceID: 0, shared: true + ) + var gtBuffer = [Float](repeating: 0.0, count: _batchSize) for elem in 0..<_batchSize / 2 { - buffer[elem] = 0.0 + gtBuffer[elem] = 0.0 } for elem in _batchSize / 2..<_batchSize { - buffer[elem] = 1.0 + gtBuffer[elem] = 1.0 } - groundTruth.upload() + groundTruth.initialize(array: &gtBuffer) let nbEpochs = 2 for epoch in 0..<nbEpochs diff --git a/Tests/GrAIExamples/VGGBenchmark.swift b/Tests/GrAIExamples/VGGBenchmark.swift - let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0) - let buffer = groundTruth.buffer + let groundTruth = FloatBuffer( + nbElems: _batchSize, deviceID: 0, shared: true + ) + var gtBuffer = [Float](repeating: 0.0, count: _batchSize) for elem in 0..<_batchSize / 2 { - buffer[elem] = 0.0 + gtBuffer[elem] = 0.0 } for elem in _batchSize / 2..<_batchSize { - buffer[elem] = 1.0 + gtBuffer[elem] = 1.0 } - groundTruth.upload() + groundTruth.initialize(array: &gtBuffer) // Initialize data once and for all. - let data = MetalPrivateBuffer<Float>( - _batchSize * 3 * _size * _size, deviceID: 0 + let data = FloatBuffer( + nbElems: _batchSize * 3 * _size * _size, deviceID: 0, shared: true + ) + var dataBuffer = [Float]( + repeating: 0.0, count: _batchSize * 3 * _size * _size ) - let dataBuffer = data.shared.buffer for i in 0..<_batchSize * 3 * _size * _size { dataBuffer[i] = Float.random(in: -1..<1) } - data.upload() + data.initialize(array: &dataBuffer) let nbEpochs = 1 let nbSteps = 20 @@ -328,8 +334,10 @@ final class VGGBenchmark: XCTestCase let lastLayer: MSE1D = vgg.layers.last as! MSE1D // Initialize the ground truth once and for all. - let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0) - let gtBuffer = groundTruth.buffer + let groundTruth = FloatBuffer( + nbElems: _batchSize, deviceID: 0, shared: true + ) + var gtBuffer = [Float](repeating: 0.0, count: _batchSize) for elem in 0..<_batchSize / 2 { gtBuffer[elem] = 0.0 @@ -338,18 +346,20 @@ { gtBuffer[elem] = 1.0 } - groundTruth.upload() + groundTruth.initialize(array: &gtBuffer) // Initialize data once and for all. - let data = MetalPrivateBuffer<Float>( - _batchSize * 3 * _size * _size, deviceID: 0 + let data = FloatBuffer( + nbElems: _batchSize * 3 * _size * _size, deviceID: 0, shared: true + ) + var dataBuffer = [Float]( + repeating: 0.0, count: _batchSize * 3 * _size * _size ) - let dataBuffer = data.shared.buffer for i in 0..<_batchSize * 3 * _size * _size { dataBuffer[i] = Float.random(in: -1..<1) } - data.upload() + data.initialize(array: &dataBuffer) let nbEpochs = 2 let nbSteps = 20 diff --git a/Tests/GrAIExamples/VGGExample.swift b/Tests/GrAIExamples/VGGExample.swift index 685967d3..d36fad54 100644 --- a/Tests/GrAIExamples/VGGExample.swift +++ b/Tests/GrAIExamples/VGGExample.swift @@ -29,7 +29,9 @@ final class VGGExample: XCTestCase { setPythonLib() _ = MetalKernel.get + GrAI.Opti.GPU = true + GrAI.Precision.float = true } /// @@ -396,17 +398,19 @@ final class VGGExample: XCTestCase let lastLayer: MSE1D = vgg.layers.last as! MSE1D // Initialize the ground truth once and for all. - let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0) - let buffer = groundTruth.buffer + let groundTruth = FloatBuffer( + nbElems: _batchSize, deviceID: 0, shared: true + ) + var gtBuffer = [Float](repeating: 0.0, count: _batchSize) for elem in 0..<_batchSize / 2 { - buffer[elem] = 0.0 + gtBuffer[elem] = 0.0 } for elem in _batchSize / 2..<_batchSize { - buffer[elem] = 1.0 + gtBuffer[elem] = 1.0 } - MetalKernel.get.upload([groundTruth]) + groundTruth.initialize(array: &gtBuffer) let nbEpochs = 5 for epoch in 0..<nbEpochs diff --git a/Tests/GrAITests/ImageTests.swift b/Tests/GrAITests/ImageTests.swift - let buffer = MetalSharedBuffer<Float>( - batchSize * 3 * _size * _size, deviceID: 0 + let buffer = FloatBuffer(nbElems: + batchSize * 3 * _size * _size, deviceID: 0, shared: true ) try!
Image.loadImages( diff --git a/Tests/GrAITests/Layer2DTests.swift b/Tests/GrAITests/Layer2DTests.swift index 9171ef89..3d17dc81 100644 --- a/Tests/GrAITests/Layer2DTests.swift +++ b/Tests/GrAITests/Layer2DTests.swift @@ -1843,13 +1843,13 @@ class Layer2DFlowTests: Input2DMSE1DCase func testNormalize1() throws { let trainer = _buildTrainer(model: "Normalize1", bn: false) - run(trainer) + run(trainer, diffThreshold: 0.0001) } func testNormalize12() throws { let trainer = _buildTrainer(model: "Normalize12", bn: false) - run(trainer) + run(trainer, diffThreshold: 0.0001) } func testFlipHorizontal1() throws @@ -2371,13 +2371,13 @@ class Layer2DFlowResetTests: Layer2DFlowTests override func testNormalize1() throws { let trainer = _buildTrainer(model: "Normalize1", bn: false) - run(trainer) + run(trainer, diffThreshold: 0.0001) } override func testNormalize12() throws { let trainer = _buildTrainer(model: "Normalize12", bn: false) - run(trainer) + run(trainer, diffThreshold: 0.0001) } override func testFlipHorizontal1() throws @@ -2771,13 +2771,13 @@ class Layer2DFlowReverseTests: Layer2DFlowTests override func testNormalize1() throws { let trainer = _buildTrainer(model: "Normalize1", bn: false) - run(trainer) + run(trainer, diffThreshold: 0.0001) } override func testNormalize12() throws { let trainer = _buildTrainer(model: "Normalize12", bn: false) - run(trainer) + run(trainer, diffThreshold: 0.0001) } override func testFlipHorizontal1() throws diff --git a/Tests/GrAITests/OptimizerTests.swift b/Tests/GrAITests/OptimizerTests.swift index 88c29e10..f5dc764c 100644 --- a/Tests/GrAITests/OptimizerTests.swift +++ b/Tests/GrAITests/OptimizerTests.swift @@ -18,7 +18,9 @@ class OptimizerTests: Input1DMSE1DCase { batchSize = 5 _ = MetalKernel.get + GrAI.Opti.GPU = true + GrAI.Precision.float = true setOptimizerParams(params: &optimizerParams) optimizerParams.nbLoops = 10 @@ -132,6 +134,7 @@ func testAdamRectified() throws { + optimizerParams.nbLoops = 5 setOptimizerParams(params: &optimizerParams, optimizerClass: .AdamRectified) let trainer = _buildTrainer() @@ -140,6 +143,7 @@ func testAdamRectifiedDecay() throws { + optimizerParams.nbLoops = 5 setOptimizerParams(params: &optimizerParams, optimizerClass: .AdamRectified, lambda: 1e-3)
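The reduce tests that follow gain a shared parameter so every kernel is exercised against both storage modes of the new FloatBuffer. The two constructions differ only in that flag; a sketch, assuming shared defaults to false as the FloatBuffer calls without the argument elsewhere in this patch suggest:

    import GrAIdient

    let nbElems = 4 * 16

    // CPU-visible storage.
    let sharedBuf = FloatBuffer(nbElems: nbElems, deviceID: 0, shared: true)

    // GPU-resident storage (the assumed default).
    let privateBuf = FloatBuffer(nbElems: nbElems, deviceID: 0, shared: false)

    // Both are filled the same way; initialize(array:) handles the upload.
    var values = [Float](repeating: 1.0, count: nbElems)
    sharedBuf.initialize(array: &values)
    privateBuf.initialize(array: &values)

Parameterizing the tests this way keeps a single code path on the caller's side while covering both Metal storage modes.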
diff --git a/Tests/GrAITests/ReduceTests.swift b/Tests/GrAITests/ReduceTests.swift index b658f102..e4000ab3 100644 --- a/Tests/GrAITests/ReduceTests.swift +++ b/Tests/GrAITests/ReduceTests.swift @@ -11,19 +11,19 @@ import GrAIdient /// Test reduce sum kernel. class ReduceSumTests: XCTestCase { - var _buffer: MetalSharedBuffer<Float>! = nil + var _buffer: FloatBuffer! = nil var _array = [Float]() override func setUp() { _ = MetalKernel.get + GrAI.Opti.GPU = true } - private func _testBuffer(dim1: Int, dim2: Int) + private func _testBuffer(dim1: Int, dim2: Int, shared: Bool) { _array = [Float](repeating: 0.0, count: dim1 * dim2) - _buffer = MetalSharedBuffer<Float>(dim1 * dim2, deviceID: 0) - let buffer = _buffer.buffer + _buffer = FloatBuffer(nbElems: dim1 * dim2, deviceID: 0, shared: shared) for elem1 in 0..<dim1 - var _buffer: MetalSharedBuffer<Float>! = nil + var _buffer: FloatBuffer! = nil var _array = [Float]() override func setUp() @@ -106,11 +146,10 @@ class ReduceMaxTests: XCTestCase _ = MetalKernel.get } - private func _testBuffer(dim1: Int, dim2: Int) + private func _testBuffer(dim1: Int, dim2: Int, shared: Bool) { _array = [Float](repeating: 0.0, count: dim1 * dim2) - _buffer = MetalSharedBuffer<Float>(dim1 * dim2, deviceID: 0) - let buffer = _buffer.buffer + _buffer = FloatBuffer(nbElems: dim1 * dim2, deviceID: 0, shared: shared) for elem1 in 0..<dim1 diff --git a/Tests/GrAITests/UpdateManagementTests.swift b/Tests/GrAITests/UpdateManagementTests.swift - let groundTruth = MetalSharedBuffer<Float>( - 1, deviceID: DEVICE_ID ) - groundTruth.buffer[0] = 0 - MetalKernel.get.upload([groundTruth]) + let groundTruth = FloatBuffer( + nbElems: 1, deviceID: DEVICE_ID ) + var buffer: [Float] = [0.0] + groundTruth.initialize(array: &buffer) let inputData1: [[Float]] = [[0.0]] let inputData2: [[Float]] = [[1.0]] @@ -610,11 +612,11 @@ class UpdateManagementTests: XCTestCase deviceID: DEVICE_ID ) - let groundTruth = MetalSharedBuffer<Float>( - 1, deviceID: DEVICE_ID ) - groundTruth.buffer[0] = 0 - MetalKernel.get.upload([groundTruth]) + let groundTruth = FloatBuffer( + nbElems: 1, deviceID: DEVICE_ID ) + var buffer: [Float] = [0.0] + groundTruth.initialize(array: &buffer) let inputData1: [Float] = [0.0] let inputData2: [Float] = [1.0] diff --git a/Tests/GrAITorchTests/Base/setup.py b/Tests/GrAITorchTests/Base/setup.py index aa80f954..7d7862e1 100644 --- a/Tests/GrAITorchTests/Base/setup.py +++ b/Tests/GrAITorchTests/Base/setup.py @@ -8,7 +8,7 @@ license='MIT', install_requires=[ "torch==1.13.1", - "torchvision==0.11.2", + "torchvision==0.14.1", "numpy==1.23.1", "pillow==9.2.0", ], diff --git a/Tests/GrAITorchTests/GrAITorchTests.swift b/Tests/GrAITorchTests/GrAITorchTests.swift index 16fe2128..a4e0b68f 100644 --- a/Tests/GrAITorchTests/GrAITorchTests.swift +++ b/Tests/GrAITorchTests/GrAITorchTests.swift @@ -21,7 +21,9 @@ final class GrAITorchTests: XCTestCase { setPythonLib() _ = MetalKernel.get + GrAI.Opti.GPU = true + GrAI.Precision.float = true } ///
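All the test setups in this patch pin GrAI.Opti.GPU and GrAI.Precision.float so existing expectations keep running on the Float32 kernels. Switching a whole run to half precision should then come down to flipping the precision flag before models and buffers are created; a sketch, assuming GrAI.Precision.float16 is settable the same way float is (only its getter is visible in the createCommand hunk above):

    import GrAIdient

    // Run once per process, as in the test setups above.
    _ = MetalKernel.get
    GrAI.Opti.GPU = true

    // Float32 execution: createCommand resolves "reduceSum" to "reduceSumFloat".
    GrAI.Precision.float = true

    // Float16 execution (assumed setter): the same call sites would now
    // resolve to "reduceSumHalf", with data crossing the CPU boundary
    // through setupHalfBuffer / getHalfBuffer.
    GrAI.Precision.float16 = true

Because the precision is read at kernel-dispatch and buffer-creation time, the flag must be set before any FloatBuffer or model state is allocated.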