diff --git a/CHANGELOG.md b/CHANGELOG.md
index af5d348b..c79f216d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
 
 ## [unreleased]
 
+⚙️ **core:** GELU vs GELUApprox ([113](https://github.com/owkin/GrAIdient/pull/113))\
 🚀 **perf:** QuerySelf & ValueSelf ([112](https://github.com/owkin/GrAIdient/pull/112))\
 🚀 **perf:** benchmark ViT base model ([111](https://github.com/owkin/GrAIdient/pull/111))\
 🐛 **fix:** run on Apple Silicon ([110](https://github.com/owkin/GrAIdient/pull/110))\
diff --git a/Sources/GrAIdient/Core/Function/Activation.swift b/Sources/GrAIdient/Core/Function/Activation.swift
index 6171a184..edb79edd 100644
--- a/Sources/GrAIdient/Core/Function/Activation.swift
+++ b/Sources/GrAIdient/Core/Function/Activation.swift
@@ -767,23 +767,23 @@ public class Sigmoid: ActivationFunction
     }
 }
 
-/// GELU activation function.
-public class GELU: ActivationFunction
+/// GELU approximative activation function.
+public class GELUApprox: ActivationFunction
 {
-    public static let str = "GELU"
+    public static let str = "GELUApprox"
     
     /// Forward GPU kernel.
     public override var forwardKernel: String
     {
         get {
-            return "forwardGELU"
+            return "forwardGELUApprox"
         }
     }
     /// Backward GPU kernel.
     public override var backwardKernel: String
     {
         get {
-            return "backwardGELU"
+            return "backwardGELUApprox"
         }
     }
     
@@ -865,6 +865,83 @@ public class GELU: ActivationFunction
     }
 }
 
+/// GELU activation function.
+public class GELU: ActivationFunction
+{
+    public static let str = "GELU"
+    
+    /// Forward GPU kernel.
+    public override var forwardKernel: String
+    {
+        get {
+            return "forwardGELU"
+        }
+    }
+    /// Backward GPU kernel.
+    public override var backwardKernel: String
+    {
+        get {
+            return "backwardGELU"
+        }
+    }
+    
+    ///
+    /// Coefficient to apply during the weights initialization.
+    ///
+    /// - Returns: The coefficient.
+    ///
+    open override var coeffInitWeights: Float
+    {
+        get {
+            return Float(sqrt(2.0))
+        }
+    }
+    
+    /// Create a GELU activation function.
+    init()
+    {
+        super.init(GELU.str)
+    }
+    
+    ///
+    /// Decode from the disk.
+    ///
+    /// Throw an error if reading from the decoder fails, or
+    /// if the data read is corrupted or otherwise invalid.
+    ///
+    /// - Parameter decoder: The decoder to read data from.
+    ///
+    required public init(from decoder: Decoder) throws
+    {
+        try super.init(from: decoder)
+    }
+    
+    ///
+    /// Forward CPU.
+    ///
+    /// - Parameter x: The input.
+    /// - Returns: The output.
+    ///
+    public override func apply(_ x: Double) -> Double
+    {
+        return 0.5 * x * (1 + erf(x / sqrt(2.0)))
+    }
+    
+    ///
+    /// Backward CPU.
+    ///
+    /// - Parameter x: The input.
+    /// - Returns: The output.
+    ///
+    public override func derivate(_ x: Double) -> Double
+    {
+        let tmp1 = 0.5 * (1.0 + erf(x / sqrt(2.0)))
+        let tmp2 = x / sqrt(2.0 * Double.pi) * exp(-x * x / 2.0)
+        let derivative = tmp1 + tmp2
+        return derivative
+    }
+}
+
 /// Factory API to build an activation function.
 public protocol ActivationKernel
 {
@@ -886,6 +963,7 @@ class ActivationKernelImpl: ActivationKernel
         LeakyReLU.str: LeakyReLUKernel(),
         SoftReLU.str: SoftReLUKernel(),
         Sigmoid.str: SigmoidKernel(),
+        GELUApprox.str: GELUApproxKernel(),
         GELU.str: GELUKernel()
     ]
     
@@ -954,7 +1032,17 @@ private class SigmoidKernel: ActivationKernelImpl
     }
 }
 
-/// Factory to build a Sigmoid function.
+/// Factory to build a GELU approximative function.
+private class GELUApproxKernel: ActivationKernelImpl
+{
+    /// Build a Sigmoid function.
+    override func build() -> ActivationFunction
+    {
+        return GELUApprox()
+    }
+}
+
+/// Factory to build a GELU function.
 private class GELUKernel: ActivationKernelImpl
 {
     /// Build a Sigmoid function.
diff --git a/Sources/GrAIdient/LayerSeq/ValueSeq.swift b/Sources/GrAIdient/LayerSeq/ValueSeq.swift
index e68c841e..09d6b70a 100644
--- a/Sources/GrAIdient/LayerSeq/ValueSeq.swift
+++ b/Sources/GrAIdient/LayerSeq/ValueSeq.swift
@@ -998,6 +998,16 @@ public class ValueSelfSeq: LayerMergeSeq
         
         if _layersPrev[0].computeDelta
         {
+            if _layersPrev[0].dirty
+            {
+                for elem in 0..<batchSize {
+                for seqK in 0..<sequence {
+                for depth in 0..<nbNeurons * _nbBlocksPrev
+                {
+                    value.get(seqK, depth)!.v[elem].delta = 0
+                }}}
+            }
+            
             for elem in 0..<batchSize {
             for head in 0..<_nbHeads {
             for seqK in 0..<sequence {
@@ -1015,18 +1025,9 @@ public class ValueSelfSeq: LayerMergeSeq
                     sum += deltaCur * scoreTmp
                 }
                 
-                if _layersPrev[0].dirty
-                {
-                    value.get(
-                        seqK, depth + _valueOffset * nbNeurons
-                    )!.v[elem].delta = sum
-                }
-                else
-                {
-                    value.get(
-                        seqK, depth + _valueOffset * nbNeurons
-                    )!.v[elem].delta += sum
-                }
+                value.get(
+                    seqK, depth + _valueOffset * nbNeurons
+                )!.v[elem].delta += sum
             }}}}
         }
         if _layersPrev[1].computeDelta
@@ -1095,7 +1096,20 @@ public class ValueSelfSeq: LayerMergeSeq
         {
             try value.checkStateBackwardGPU(batchSize: batchSize)
             
-            let pDirty: [UInt32] = value.dirty ? [1] : [0]
+            if value.dirty
+            {
+                let nbElems = value.delta.nbElems
+                let pNbElems: [UInt32] = [UInt32(nbElems)]
+                
+                command = MetalKernel.get.createCommand(
+                    "reset", deviceID: deviceID
+                )
+                command.setBytes(pNbElems, atIndex: 0)
+                command.setBuffer(value.delta.metal, atIndex: 1)
+                
+                command.dispatchThreads(nbElems)
+                command.enqueue()
+            }
             
             let kernel = (nbNeurons / _nbHeads) % 4 == 0 ?
                 "valueSelfValueSeq4Backward" : "valueSelfValueSeqBackward"
@@ -1112,8 +1126,7 @@ public class ValueSelfSeq: LayerMergeSeq
             command.setBytes(pGlobalOffset, atIndex: 6)
             command.setBytes(pNbBatch, atIndex: 7)
             command.setBytes(pSequence, atIndex: 8)
-            command.setBytes(pDirty, atIndex: 9)
-            command.setBuffer(value.delta.metal, atIndex: 10)
+            command.setBuffer(value.delta.metal, atIndex: 9)
             
             command.dispatchThreads(
                 width: nbNeurons / coeff,
diff --git a/Sources/GrAIdient/Metal/Kernel/Activation.metal b/Sources/GrAIdient/Metal/Kernel/Activation.metal
index 2e0895ac..a4371bbb 100644
--- a/Sources/GrAIdient/Metal/Kernel/Activation.metal
+++ b/Sources/GrAIdient/Metal/Kernel/Activation.metal
@@ -239,7 +239,7 @@ kernel void backwardSigmoid(
     delta[id] = delta[id] * derivative;
 }
 
-kernel void forwardGELU(
+kernel void forwardGELUApprox(
    constant uint * pNbElems,
    device float * tmps,
    device float * outs,
@@ -275,7 +275,7 @@ kernel void forwardGELU(
     outs[id] = 0.5 * x * (1 + tmp2);
 }
 
-kernel void backwardGELU(
+kernel void backwardGELUApprox(
     const device float * tmps,
     constant uint * pNbElems,
     device float * delta,
@@ -311,3 +311,93 @@ kernel void backwardGELU(
     float derivative = 0.5 * (1 + tmp2 + x * tmp3);
     delta[id] = delta[id] * derivative;
 }
+
+/*
+ * Approximation to the error function.
+ * Based on code from:
+ * https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff#answer-35148199
+ */
+float erf(float a)
+{
+    float r, s, t, u;
+    t = metal::abs(a);
+    s = a * a;
+    if (t > 0.927734375f)
+    {
+        // maximum error 0.99527 ulp
+        r = metal::fma(-1.72853470e-5f, t, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12
+        u = metal::fma(-3.88396438e-3f, t, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6
+        r = metal::fma(r, s, u);
+        r = metal::fma(r, t, -1.06777877e-1f); // -0x1.b55cb8p-4
+        r = metal::fma(r, t, -6.34846687e-1f); // -0x1.450aa0p-1
+        r = metal::fma(r, t, -1.28717512e-1f); // -0x1.079d0cp-3
+        r = metal::fma(r, t, -t);
+        // TODO, replace with expm1 when implemented
+        r = 1.0f - metal::exp(r);
+        r = metal::copysign(r, a);
+    }
+    else
+    {
+        // maximum error 0.98929 ulp
+        r = -5.96761703e-4f; // -0x1.38e000p-11
+        r = metal::fma(r, s, 4.99119423e-3f); //  0x1.471a58p-8
+        r = metal::fma(r, s, -2.67681349e-2f); // -0x1.b691b2p-6
+        r = metal::fma(r, s, 1.12819925e-1f); //  0x1.ce1c44p-4
+        r = metal::fma(r, s, -3.76125336e-1f); // -0x1.812700p-2
+        r = metal::fma(r, s, 1.28379166e-1f); //  0x1.06eba8p-3
+        r = metal::fma(r, a, a);
+    }
+    return r;
+}
+
+kernel void forwardGELU(
+   constant uint * pNbElems,
+   device float * tmps,
+   device float * outs,
+   uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+    
+    if (pNbElems)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+    
+    if (id >= nbElems)
+    {
+        return ;
+    }
+    
+    float x = outs[id];
+    tmps[id] = x;
+    outs[id] = 0.5 * x * (1 + erf(x / sqrt(2.0)));
+}
+
+kernel void backwardGELU(
+    const device float * tmps,
+    constant uint * pNbElems,
+    device float * delta,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+    
+    if (pNbElems)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+    
+    if (id >= nbElems)
+    {
+        return ;
+    }
+    
+    float x = tmps[id];
+    float tmp1 = 0.5 * (1.0 + erf(x / sqrt(2.0)));
+    float tmp2 = x / sqrt(2.0 * M_PI_F) * exp(-x * x / 2.0);
+    float derivative = tmp1 + tmp2;
+    delta[id] = delta[id] * derivative;
+}
diff --git a/Sources/GrAIdient/Metal/Kernel/LayerSeq.metal b/Sources/GrAIdient/Metal/Kernel/LayerSeq.metal
index 4c551f4b..8502fbcb 100644
--- a/Sources/GrAIdient/Metal/Kernel/LayerSeq.metal
+++ b/Sources/GrAIdient/Metal/Kernel/LayerSeq.metal
@@ -2401,7 +2401,6 @@ kernel void valueSelfValueSeqBackward(
     constant uint * pGlobalOffset,
     constant uint * pNbBatch,
     constant uint * pSequence,
-    constant uint * pDirty,
     device float * value,
     uint2 id [[ thread_position_in_grid ]])
 {
@@ -2414,10 +2413,9 @@ kernel void valueSelfValueSeqBackward(
     uint nbBatch;
     uint sequence;
     uint size;
-    uint dirty;
     
     if (pNbHeads && pNbNeurons && pNbNeuronsPrev &&
-        pNbBlocksPrev && pGlobalOffset && pNbBatch && pSequence && pDirty &&
+        pNbBlocksPrev && pGlobalOffset && pNbBatch && pSequence &&
         value && score && delta)
     {
         nbHeads = *pNbHeads;
@@ -2429,7 +2427,6 @@ kernel void valueSelfValueSeqBackward(
         nbBatch = *pNbBatch;
         sequence = *pSequence;
         size = nbNeurons2 / nbHeads;
-        dirty = *pDirty;
     }
     else
         return ;
@@ -2459,14 +2456,7 @@ kernel void valueSelfValueSeqBackward(
     
     uint offsetValue = depth + valueOffset * nbNeurons2 +
         nbNeurons1 * seqK + sequence * nbNeurons1 * elem;
-    if (dirty)
-    {
-        value[offsetValue] = tmp;
-    }
-    else
-    {
-        value[offsetValue] += tmp;
-    }
+    value[offsetValue] += tmp;
 }
 
 kernel void valueSelfValueSeq4Backward(
@@ -2479,7 +2469,6 @@ kernel void valueSelfValueSeq4Backward(
     constant uint * pGlobalOffset,
     constant uint * pNbBatch,
     constant uint * pSequence,
-    constant uint * pDirty,
     device float4 * value,
     uint2 id [[ thread_position_in_grid ]])
 {
@@ -2492,10 +2481,9 @@ kernel void valueSelfValueSeq4Backward(
     uint nbBatch;
     uint sequence;
     uint size;
-    uint dirty;
     
     if (pNbHeads && pNbNeurons && pNbNeuronsPrev &&
-        pNbBlocksPrev && pGlobalOffset && pNbBatch && pSequence && pDirty &&
+        pNbBlocksPrev && pGlobalOffset && pNbBatch && pSequence &&
         value && score && delta)
     {
         nbHeads = *pNbHeads;
@@ -2507,7 +2495,6 @@ kernel void valueSelfValueSeq4Backward(
         nbBatch = *pNbBatch;
         sequence = *pSequence;
         size = nbNeurons2 / nbHeads;
-        dirty = *pDirty;
     }
     else
         return ;
@@ -2538,14 +2525,7 @@ kernel void valueSelfValueSeq4Backward(
     
     uint offsetValue = (depth + valueOffset * nbNeurons2 +
         nbNeurons1 * seqK + sequence * nbNeurons1 * elem) / 4;
-    if (dirty)
-    {
-        value[offsetValue] = tmp;
-    }
-    else
-    {
-        value[offsetValue] += tmp;
-    }
+    value[offsetValue] += tmp;
 }
 
 kernel void valueSelfScoreSeqBackward(
diff --git a/Sources/GrAIdient/Metal/MetalConfig.swift b/Sources/GrAIdient/Metal/MetalConfig.swift
index 6b1e04e7..3e8f3151 100644
--- a/Sources/GrAIdient/Metal/MetalConfig.swift
+++ b/Sources/GrAIdient/Metal/MetalConfig.swift
@@ -16,6 +16,8 @@ let CONFIG_KERNELS =
         "backwardSoftReLU",
         "forwardSigmoid",
         "backwardSigmoid",
+        "forwardGELUApprox",
+        "backwardGELUApprox",
         "forwardGELU",
         "backwardGELU",
     ],
diff --git a/Sources/GrAIdient/Utils/Concurrency.swift b/Sources/GrAIdient/Utils/Concurrency.swift
index 7c28366c..cb62a1f2 100644
--- a/Sources/GrAIdient/Utils/Concurrency.swift
+++ b/Sources/GrAIdient/Utils/Concurrency.swift
@@ -7,6 +7,40 @@
 
 import Foundation
 
+///
+/// Split an ensemble of elements into "balanced" batches.
+///
+/// - Parameters :
+///     - nbElems: The number of elements in the ensemble.
+///     - nbSplits: The number of batch splits.
+/// - Returns: The list of (start, end) indices for the different batches.
+///
+func splitBatch(
+    nbElems: Int, nbSplits: Int
+) -> [(start: Int, end: Int)]
+{
+    var batchRanges = [(start: Int, end: Int)]()
+    let batchSize = nbElems / nbSplits
+    let remaining = nbElems % nbSplits
+    
+    var cur = 0
+    for block in 0..<nbSplits
+    {
+        var batchSizeTmp = min(batchSize, nbElems - cur)
+        if block < remaining
+        {
+            if batchSizeTmp != batchSize
+            {
+                fatalError()
+            }
+            batchSizeTmp += 1
+        }
+        batchRanges.append((start: cur, end: cur + batchSizeTmp))
+        cur += batchSizeTmp
+    }
+    return batchRanges
+}
+
 /// A namespace  to execute functions in parallel.
 public class Concurrency
 {
@@ -19,31 +53,23 @@ public class Concurrency
     ///     
     public static func slice(_ nbElems: Int, _ block: (Int) -> ())
     {
-        let nbThreads = ProcessInfo.processInfo.activeProcessorCount
-        if nbElems >= nbThreads
+        let nbThreads = min(
+            nbElems, ProcessInfo.processInfo.activeProcessorCount
+        )
+        if nbThreads > 1
         {
+            let batchRanges = splitBatch(nbElems: nbElems, nbSplits: nbThreads)
             DispatchQueue.concurrentPerform(iterations: nbThreads)
             {
                 (thread: Int) in
                 
-                let nbElemsPerThread = nbElems / nbThreads
-                let start = thread * nbElemsPerThread
-                let end = min(nbElems, (thread+1) * nbElemsPerThread)
-                
-                for elem in start..<end
+                let range = batchRanges[thread]
+                for elem in range.start..<range.end
                 {
                     block(elem)
                 }
             }
         }
-        else if nbElems > 1
-        {
-            DispatchQueue.concurrentPerform(iterations: nbElems)
-            {
-                (thread: Int) in
-                block(thread)
-            }
-        }
         else if nbElems == 1
         {
             block(0)
diff --git a/Tests/GrAIExamples/TransformerBenchmark.swift b/Tests/GrAIExamples/TransformerBenchmark.swift
index ae7c2455..3265c401 100644
--- a/Tests/GrAIExamples/TransformerBenchmark.swift
+++ b/Tests/GrAIExamples/TransformerBenchmark.swift
@@ -215,7 +215,7 @@ final class TransformerBenchmark: XCTestCase
     }
     
     /// Test: train a ViT model.
-    func test_TrainTransformer()
+    func _test_TrainTransformer()
     {
         // Get optimizer parameters for iterating over batch size elements.
         let params = _getOptimizerParams(nbLoops: _batchSize)
@@ -329,7 +329,7 @@ final class TransformerBenchmark: XCTestCase
     }
     
     /// Test: evaluate a ViT model.
-    func test_EvalTransformer()
+    func _test_EvalTransformer()
     {
         // Build a model with randomly initialized weights.
         let transformer = _buildModel(
diff --git a/Tests/GrAITests/Activation1DTests.swift b/Tests/GrAITests/Activation1DTests.swift
index 67716e23..4b3aa426 100644
--- a/Tests/GrAITests/Activation1DTests.swift
+++ b/Tests/GrAITests/Activation1DTests.swift
@@ -164,6 +164,23 @@ class Activation1DGradTests: Input1DMSE1DCase
         run(trainer)
     }
     
+    func testFLGELUApproxCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "FullyConnected", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
+    func testFLGELUApproxGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "FullyConnected", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
     func testFLGELUCPU() throws
     {
         GrAI.Opti.CPU = true
@@ -249,6 +266,23 @@ class Activation1DGradTests: Input1DMSE1DCase
         run(trainer)
     }
     
+    func testGELUApproxCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "Activation", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
+    func testGELUApproxGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "Activation", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
     func testGELUCPU() throws
     {
         GrAI.Opti.CPU = true
diff --git a/Tests/GrAITests/Activation2DTests.swift b/Tests/GrAITests/Activation2DTests.swift
index 852e19f2..0f821e63 100644
--- a/Tests/GrAITests/Activation2DTests.swift
+++ b/Tests/GrAITests/Activation2DTests.swift
@@ -256,6 +256,40 @@ class Activation2DGradTests: Input2DMSE1DCase
         run(trainer)
     }
     
+    func testConvGELUApproxNoBNCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "Convolution", activation: GELUApprox.str, bn: false
+        )
+        run(trainer)
+    }
+    
+    func testConvGELUApproxBNCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "Convolution", activation: GELUApprox.str, bn: true
+        )
+        run(trainer)
+    }
+    
+    func testConvGELUApproxNoBNGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "Convolution", activation: GELUApprox.str, bn: false
+        )
+        run(trainer)
+    }
+    
+    func testConvGELUApproxBNGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "Convolution", activation: GELUApprox.str, bn: true
+        )
+        run(trainer)
+    }
+    
     func testConvGELUNoBNCPU() throws
     {
         GrAI.Opti.CPU = true
@@ -358,6 +392,23 @@ class Activation2DGradTests: Input2DMSE1DCase
         run(trainer)
     }
     
+    func testGELUApproxCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "Activation", activation: GELUApprox.str, bn: false
+        )
+        run(trainer)
+    }
+    
+    func testGELUApproxGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "Activation", activation: GELUApprox.str, bn: false
+        )
+        run(trainer)
+    }
+    
     func testGELUCPU() throws
     {
         GrAI.Opti.CPU = true
diff --git a/Tests/GrAITests/ActivationSeqTests.swift b/Tests/GrAITests/ActivationSeqTests.swift
index 5eda7487..da7bb90c 100644
--- a/Tests/GrAITests/ActivationSeqTests.swift
+++ b/Tests/GrAITests/ActivationSeqTests.swift
@@ -171,6 +171,23 @@ class ActivationSeqGradTests: Input2DMSE1DCase
         run(trainer)
     }
     
+    func testFLGELUApproxCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "FullyConnected", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
+    func testFLGELUApproxGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "FullyConnected", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
     func testFLGELUCPU() throws
     {
         GrAI.Opti.CPU = true
@@ -256,6 +273,23 @@ class ActivationSeqGradTests: Input2DMSE1DCase
         run(trainer)
     }
     
+    func testGELUApproxCPU() throws
+    {
+        GrAI.Opti.CPU = true
+        let trainer = _buildTrainer(
+            model: "Activation", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
+    func testGELUApproxGPU() throws
+    {
+        let trainer = _buildTrainer(
+            model: "Activation", activation: GELUApprox.str
+        )
+        run(trainer)
+    }
+    
     func testGELUCPU() throws
     {
         GrAI.Opti.CPU = true