diff --git a/CHANGELOG.md b/CHANGELOG.md
index 409fd909..bcf6fbd8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,8 @@ All notable changes to this project will be documented in this file.
 
 ## [unreleased]
 
-🚀 **examples**: 3 LLMs examples ([#130](https://github.com/owkin/GrAIdient/pull/130))\
+✨ **layer_seq:** LLM sliding window ([#131](https://github.com/owkin/GrAIdient/pull/131))\
+🚀 **examples:** 3 LLMs examples ([#130](https://github.com/owkin/GrAIdient/pull/130))\
 📚 **docs:** LLM doc & split tests ([129](https://github.com/owkin/GrAIdient/pull/129))\
 ✨ **layer_seq:** LLM generate ([128](https://github.com/owkin/GrAIdient/pull/128))\
 ✨ **layer_seq:** MultiplySeq, SiLU & LLM test ([127](https://github.com/owkin/GrAIdient/pull/127))\
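
Throughout the Swift changes below, the attended sequence length is clamped to the new `cacheSeqMax` bound. A minimal standalone sketch of that clamping (the function name is illustrative, not part of the library):

// During generation, step `cacheSeq` attends to the cached tokens plus
// the token being generated, but never to more than the window size.
func attendedLength(cacheSeq: Int, cacheSeqMax: Int) -> Int
{
    return min(cacheSeq + 1, cacheSeqMax)
}

// Before the window fills up, the length still grows by one per step.
assert(attendedLength(cacheSeq: 10, cacheSeqMax: 128) == 11)
// Once full, it stays pinned at the window size.
assert(attendedLength(cacheSeq: 500, cacheSeqMax: 128) == 128)
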
diff --git a/Sources/GrAIdient/LayerSeq/QuerySeq.swift b/Sources/GrAIdient/LayerSeq/QuerySeq.swift
index e4c4fd06..11330ecb 100644
--- a/Sources/GrAIdient/LayerSeq/QuerySeq.swift
+++ b/Sources/GrAIdient/LayerSeq/QuerySeq.swift
@@ -1270,20 +1270,25 @@ public class QueryCausalSeq: LayerMergeSeq
         if cacheKey != nil && cacheSeq != nil &&
            cacheKey.nbElems != batchSize * cacheSeqMax * nbNeuronsPrevKey
         {
-            _cacheKeyTmp = FloatBuffer(
+            let cacheKeyTmp = FloatBuffer(
                 nbElems: batchSize * cacheSeqMax * nbNeuronsPrevKey,
                 deviceID: deviceID
             )
             
             let nbElems = batchSize * cacheSeq * nbNeuronsPrevKey
-            _copyGPU(nbElems: nbElems, from: cacheKey, to: _cacheKeyTmp)
+            _copyGPU(nbElems: nbElems, from: cacheKey, to: cacheKeyTmp)
             
             cacheKey = FloatBuffer(
                 nbElems: batchSize * cacheSeqMax * nbNeuronsPrevKey,
                 deviceID: deviceID
             )
-            _copyGPU(nbElems: nbElems, from: _cacheKeyTmp, to: cacheKey)
+            _copyGPU(nbElems: nbElems, from: cacheKeyTmp, to: cacheKey)
+            
+            if batchSize > 1
+            {
+                _cacheKeyTmp = cacheKeyTmp
+            }
         }
     }
@@ -1664,13 +1669,13 @@ public class QueryCausalSeq: LayerMergeSeq
             throw LayerError.Init(message: "`sequence` should be 1.")
         }
         
-        _concatGPU()
+        _mergeCacheGPU()
         
         let query = layersPrev[0] as! LayerSeq
         let key = layersPrev[1] as! LayerSeq
         let nbNeuronsPrevQuery = query.nbNeurons
         let nbNeuronsPrevKey = key.nbNeurons
-        let nbNeurons = (cacheSeq + 1) * _nbHeadsQuery
+        let nbNeurons = min(cacheSeq + 1, cacheSeqMax) * _nbHeadsQuery
         
         let pNbHeadsQuery: [UInt32] = [UInt32(_nbHeadsQuery)]
         let pNbHeadsKey: [UInt32] = [UInt32(_nbHeadsKey)]
@@ -1678,7 +1683,7 @@ public class QueryCausalSeq: LayerMergeSeq
         let pNbNeuronsPrevQuery: [UInt32] = [UInt32(nbNeuronsPrevQuery)]
         let pNbNeuronsPrevKey: [UInt32] = [UInt32(nbNeuronsPrevKey)]
         let pNbBatch: [UInt32] = [UInt32(batchSize)]
-        let pSequence: [UInt32] = [UInt32(cacheSeq + 1)]
+        let pSequence: [UInt32] = [UInt32(min(cacheSeq + 1, cacheSeqMax))]
         
         let kernel = (nbNeuronsPrevQuery / _nbHeadsQuery) % 4 == 0 ?
             "queryCausalSeq4Generate" : "queryCausalSeqGenerate"
@@ -1686,7 +1691,7 @@ public class QueryCausalSeq: LayerMergeSeq
             kernel, deviceID: deviceID
         )
         command.setBuffer(query.outs.metal, atIndex: 0)
-        command.setBuffer(_cacheKeyTmp.metal, atIndex: 1)
+        command.setBuffer(_getKeyCacheOutputGPU()!.metal, atIndex: 1)
         command.setBytes(pNbHeadsQuery, atIndex: 2)
         command.setBytes(pNbHeadsKey, atIndex: 3)
         command.setBytes(pNbNeurons, atIndex: 4)
@@ -1702,22 +1707,29 @@ public class QueryCausalSeq: LayerMergeSeq
         )
         command.enqueue()
         
-        let nbElems = batchSize * (cacheSeq + 1) * nbNeuronsPrevKey
-        _copyGPU(nbElems: nbElems, from: _cacheKeyTmp, to: cacheKey)
-        
         cacheSeq += 1
     }
     
-    /// Concatenate cache to key.
-    private func _concatGPU()
+    /// Merge cache to key.
+    private func _mergeCacheGPU()
     {
+        let slidingWindow: Bool
+        if cacheSeq >= cacheSeqMax
+        {
+            slidingWindow = true
+        }
+        else
+        {
+            slidingWindow = false
+        }
+        
         let key = layersPrev[1] as! LayerSeq
         let nbNeuronsPrevKey = key.nbNeurons
         let nbNeurons = nbNeuronsPrevKey
         
         let pNbNeurons: [UInt32] = [UInt32(nbNeurons)]
         let pNbBatch: [UInt32] = [UInt32(batchSize)]
-        let pSequence: [UInt32] = [UInt32(cacheSeq + 1)]
+        let pSequence: [UInt32] = [UInt32(min(cacheSeq + 1, cacheSeqMax))]
         let pSequenceCache: [UInt32] = [UInt32(cacheSeq)]
         let pSequenceKey: [UInt32] = [UInt32(1)]
@@ -1725,32 +1737,41 @@ public class QueryCausalSeq: LayerMergeSeq
         var command: MetalCommand
         
         var globalOffset = 0
-        
-        var pGlobalOffset: [UInt32] = [UInt32(globalOffset)]
-        
         let kernel = nbNeurons % 4 == 0 ?
             "concat1Seq4Forward" : "concat1SeqForward"
         let coeff = nbNeurons % 4 == 0 ? 4 : 1
         
-        command = metalKernel.createCommand(
-            kernel, deviceID: deviceID
-        )
-        command.setBuffer(cacheKey.metal, atIndex: 0)
-        command.setBytes(pGlobalOffset, atIndex: 1)
-        command.setBytes(pNbNeurons, atIndex: 2)
-        command.setBytes(pNbBatch, atIndex: 3)
-        command.setBytes(pSequence, atIndex: 4)
-        command.setBytes(pSequenceCache, atIndex: 5)
-        command.setBuffer(_cacheKeyTmp.metal, atIndex: 6)
-        
-        command.dispatchThreads(
-            width: nbNeurons / coeff,
-            height: batchSize * cacheSeq
-        )
-        command.enqueue()
+        if batchSize != 1 && !slidingWindow
+        {
+            let pGlobalOffset: [UInt32] = [UInt32(globalOffset)]
+            
+            command = metalKernel.createCommand(
+                kernel, deviceID: deviceID
+            )
+            command.setBuffer(_getKeyCacheInputGPU()!.metal, atIndex: 0)
+            command.setBytes(pGlobalOffset, atIndex: 1)
+            command.setBytes(pNbNeurons, atIndex: 2)
+            command.setBytes(pNbBatch, atIndex: 3)
+            command.setBytes(pSequence, atIndex: 4)
+            command.setBytes(pSequenceCache, atIndex: 5)
+            command.setBuffer(_getKeyCacheOutputGPU()!.metal, atIndex: 6)
+            
+            command.dispatchThreads(
+                width: nbNeurons / coeff,
+                height: batchSize * cacheSeq
+            )
+            command.enqueue()
+        }
         
-        globalOffset += cacheSeq
+        globalOffset += cacheSeq % cacheSeqMax
+        // TODO: when using sliding window with an instruct model,
+        // it is risky to erase the header information!
+        // if cacheSeq >= cacheSeqMax
+        // {
+        //     globalOffset += 5
+        // }
         
-        pGlobalOffset = [UInt32(globalOffset)]
+        let pGlobalOffset = [UInt32(globalOffset)]
         
         command = metalKernel.createCommand(
             kernel, deviceID: deviceID
@@ -1761,7 +1782,7 @@ public class QueryCausalSeq: LayerMergeSeq
         command.setBytes(pNbBatch, atIndex: 3)
         command.setBytes(pSequence, atIndex: 4)
         command.setBytes(pSequenceKey, atIndex: 5)
-        command.setBuffer(_cacheKeyTmp.metal, atIndex: 6)
+        command.setBuffer(_getKeyCacheOutputGPU()!.metal, atIndex: 6)
         
         command.dispatchThreads(
             width: nbNeurons / coeff,
@@ -1770,6 +1791,67 @@ public class QueryCausalSeq: LayerMergeSeq
         command.enqueue()
     }
     
+    ///
+    /// Get key cache buffer to use as input in Metal kernel.
+    ///
+    /// - Returns: key cache to use as input.
+    ///
+    private func _getKeyCacheInputGPU() -> FloatBuffer?
+    {
+        if cacheSeq != nil
+        {
+            if cacheSeq % 2 == 0
+            {
+                return _cacheKeyTmp
+            }
+            else
+            {
+                return cacheKey
+            }
+        }
+        return nil
+    }
+    
+    ///
+    /// Get key cache buffer to use as output in Metal kernel.
+    ///
+    /// - Returns: key cache to use as output.
+    ///
+    private func _getKeyCacheOutputGPU() -> FloatBuffer?
+    {
+        if cacheSeq != nil
+        {
+            if batchSize == 1
+            {
+                return cacheKey
+            }
+            else
+            {
+                if cacheSeq >= cacheSeqMax // sliding window
+                {
+                    // The cache key has not changed.
+                    if (cacheSeqMax - 1) % 2 == 0
+                    {
+                        return cacheKey
+                    }
+                    else
+                    {
+                        return _cacheKeyTmp
+                    }
+                }
+                else if cacheSeq % 2 == 0
+                {
+                    return cacheKey
+                }
+                else
+                {
+                    return _cacheKeyTmp
+                }
+            }
+        }
+        return nil
+    }
+    
     /// Apply the forward pass in the GPU execution context.
     private func _forwardGPU()
     {
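
The `_getKeyCacheInputGPU` / `_getKeyCacheOutputGPU` pair above implements double buffering for `batchSize > 1`: each generation step reads the buffer written by the previous step and writes the other one, alternating on the parity of `cacheSeq`, so a kernel never reads and writes the same buffer. A hypothetical sketch of that alternation (not the library's API):

// Generic ping-pong pair: even steps read A and write B, odd steps the
// reverse, mirroring the `cacheSeq % 2` tests above.
struct PingPong<Buffer>
{
    let bufA: Buffer
    let bufB: Buffer
    
    func input(step: Int) -> Buffer
    {
        return step % 2 == 0 ? bufA : bufB
    }
    func output(step: Int) -> Buffer
    {
        return step % 2 == 0 ? bufB : bufA
    }
}

Once the sliding window is active (`cacheSeq >= cacheSeqMax`), the output buffer stops alternating and stays at the one selected for step `cacheSeqMax - 1`, since the window is then rewritten in place at each step.
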
diff --git a/Sources/GrAIdient/LayerSeq/SoftmaxSeq.swift b/Sources/GrAIdient/LayerSeq/SoftmaxSeq.swift
index bff11333..375ea688 100644
--- a/Sources/GrAIdient/LayerSeq/SoftmaxSeq.swift
+++ b/Sources/GrAIdient/LayerSeq/SoftmaxSeq.swift
@@ -363,11 +363,15 @@ public class SoftmaxSeq: LayerSeq
 ///
 public class SoftmaxCausalSeq: SoftmaxSeq
 {
+    /// Maximal sequence of cache.
+    public var cacheSeqMax = 128
+    
     /// Current cache sequence.
     public var cacheSeq: Int! = nil
     
     private enum Keys: String, CodingKey
     {
+        case cacheSeqMax
         case cacheSeq
     }
@@ -401,6 +405,7 @@ public class SoftmaxCausalSeq: SoftmaxSeq
     public required init(from decoder: Decoder) throws
     {
         let values = try decoder.container(keyedBy: Keys.self)
+        cacheSeqMax = try values.decode(Int.self, forKey: Keys.cacheSeqMax)
         cacheSeq = try values.decodeIfPresent(Int.self, forKey: .cacheSeq)
         try super.init(from: decoder)
     }
@@ -419,6 +424,7 @@ public class SoftmaxCausalSeq: SoftmaxSeq
     public override func encode(to encoder: Encoder) throws
     {
         var container = encoder.container(keyedBy: Keys.self)
+        try container.encode(cacheSeqMax, forKey: Keys.cacheSeqMax)
         if cacheSeq != nil
         {
             try container.encode(cacheSeq, forKey: Keys.cacheSeq)
@@ -453,6 +459,8 @@ public class SoftmaxCausalSeq: SoftmaxSeq
             nbHeads: _nbHeads,
             params: params
         )
+        
+        layer.cacheSeqMax = cacheSeqMax
         layer.cacheSeq = cacheSeq
         
         return layer
@@ -507,7 +515,7 @@ public class SoftmaxCausalSeq: SoftmaxSeq
         
         if let layerPrev = self.layerPrev as? LayerSeq
         {
-            let nbNeurons = (cacheSeq + 1) * _nbHeads
+            let nbNeurons = min(cacheSeq + 1, cacheSeqMax) * _nbHeads
             
             let pNbHeads: [UInt32] = [UInt32(_nbHeads)]
             let pNbNeurons: [UInt32] = [UInt32(nbNeurons)]
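
`SoftmaxCausalSeq` now persists `cacheSeqMax` next to `cacheSeq`. A minimal sketch of the same Codable pattern, on an illustrative type rather than the library's:

import Foundation

// Required field encoded unconditionally; optional field decoded with
// decodeIfPresent, as in SoftmaxCausalSeq above.
struct CacheState: Codable
{
    var cacheSeqMax = 128
    var cacheSeq: Int? = nil
}

let data = try! JSONEncoder().encode(CacheState(cacheSeqMax: 256, cacheSeq: 3))
let state = try! JSONDecoder().decode(CacheState.self, from: data)
assert(state.cacheSeqMax == 256 && state.cacheSeq == 3)

Note that `cacheSeqMax` is read with a plain `decode`, so it must be present in serialized models; `decodeIfPresent` falling back to the 128 default would be the backward-compatible alternative for checkpoints written before this change.
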
diff --git a/Sources/GrAIdient/LayerSeq/ValueSeq.swift b/Sources/GrAIdient/LayerSeq/ValueSeq.swift
index 2c5d2e59..6267b718 100644
--- a/Sources/GrAIdient/LayerSeq/ValueSeq.swift
+++ b/Sources/GrAIdient/LayerSeq/ValueSeq.swift
@@ -1344,20 +1344,25 @@ public class ValueCausalSeq: LayerMergeSeq
         if cacheValue != nil && cacheSeq != nil &&
            cacheValue.nbElems != batchSize * cacheSeqMax * nbNeuronsPrevValue
         {
-            _cacheValueTmp = FloatBuffer(
+            let cacheValueTmp = FloatBuffer(
                 nbElems: batchSize * cacheSeqMax * nbNeuronsPrevValue,
                 deviceID: deviceID
             )
             
             let nbElems = batchSize * cacheSeq * nbNeuronsPrevValue
-            _copyGPU(nbElems: nbElems, from: cacheValue, to: _cacheValueTmp)
+            _copyGPU(nbElems: nbElems, from: cacheValue, to: cacheValueTmp)
             
             cacheValue = FloatBuffer(
                 nbElems: batchSize * cacheSeqMax * nbNeuronsPrevValue,
                 deviceID: deviceID
             )
-            _copyGPU(nbElems: nbElems, from: _cacheValueTmp, to: cacheValue)
+            _copyGPU(nbElems: nbElems, from: cacheValueTmp, to: cacheValue)
+            
+            if batchSize > 1
+            {
+                _cacheValueTmp = cacheValueTmp
+            }
         }
     }
@@ -1658,12 +1663,12 @@ public class ValueCausalSeq: LayerMergeSeq
             throw LayerError.Init(message: "`sequence` should be 1.")
         }
         
-        _concatGPU()
+        _mergeCacheGPU()
         
         let value = layersPrev[0] as! LayerSeq
         let score = layersPrev[1] as! LayerSeq
         let nbNeuronsPrevValue = value.nbNeurons
-        let nbNeuronsPrevScore = score.nbNeurons
+        let nbNeuronsPrevScore = min(cacheSeq + 1, cacheSeqMax) * _nbHeadsScore
         
         let pNbHeadsValue: [UInt32] = [UInt32(_nbHeadsValue)]
         let pNbHeadsScore: [UInt32] = [UInt32(_nbHeadsScore)]
@@ -1671,7 +1676,7 @@ public class ValueCausalSeq: LayerMergeSeq
         let pNbNeuronsPrevValue: [UInt32] = [UInt32(nbNeuronsPrevValue)]
         let pNbNeuronsPrevScore: [UInt32] = [UInt32(nbNeuronsPrevScore)]
         let pNbBatch: [UInt32] = [UInt32(batchSize)]
-        let pSequence: [UInt32] = [UInt32(cacheSeq + 1)]
+        let pSequence: [UInt32] = [UInt32(min(cacheSeq + 1, cacheSeqMax))]
         
         let kernel = (nbNeurons / _nbHeadsScore) % 4 == 0 ?
             "valueCausalSeq4Generate" : "valueCausalSeqGenerate"
@@ -1679,7 +1684,7 @@ public class ValueCausalSeq: LayerMergeSeq
         let command = MetalKernel.get.createCommand(
             kernel, deviceID: deviceID
         )
-        command.setBuffer(_cacheValueTmp.metal, atIndex: 0)
+        command.setBuffer(_getValueCacheOutputGPU()!.metal, atIndex: 0)
         command.setBuffer(score.outs.metal, atIndex: 1)
         command.setBytes(pNbHeadsValue, atIndex: 2)
         command.setBytes(pNbHeadsScore, atIndex: 3)
@@ -1696,22 +1701,29 @@ public class ValueCausalSeq: LayerMergeSeq
         )
         command.enqueue()
         
-        let nbElems = batchSize * (cacheSeq + 1) * nbNeuronsPrevValue
-        _copyGPU(nbElems: nbElems, from: _cacheValueTmp, to: cacheValue)
-        
         cacheSeq += 1
     }
     
-    /// Concatenate cache to key.
-    private func _concatGPU()
+    /// Merge cache to value.
+    private func _mergeCacheGPU()
     {
+        let slidingWindow: Bool
+        if cacheSeq >= cacheSeqMax
+        {
+            slidingWindow = true
+        }
+        else
+        {
+            slidingWindow = false
+        }
+        
         let value = layersPrev[0] as! LayerSeq
         let nbNeuronsPrevValue = value.nbNeurons
         let nbNeurons = nbNeuronsPrevValue
         
         let pNbNeurons: [UInt32] = [UInt32(nbNeurons)]
         let pNbBatch: [UInt32] = [UInt32(batchSize)]
-        let pSequence: [UInt32] = [UInt32(cacheSeq + 1)]
+        let pSequence: [UInt32] = [UInt32(min(cacheSeq + 1, cacheSeqMax))]
         let pSequenceCache: [UInt32] = [UInt32(cacheSeq)]
         let pSequenceValue: [UInt32] = [UInt32(1)]
@@ -1719,32 +1731,41 @@ public class ValueCausalSeq: LayerMergeSeq
         var command: MetalCommand
         
         var globalOffset = 0
-        
-        var pGlobalOffset: [UInt32] = [UInt32(globalOffset)]
-        
         let kernel = nbNeurons % 4 == 0 ?
             "concat1Seq4Forward" : "concat1SeqForward"
         let coeff = nbNeurons % 4 == 0 ? 4 : 1
         
-        command = metalKernel.createCommand(
-            kernel, deviceID: deviceID
-        )
-        command.setBuffer(cacheValue.metal, atIndex: 0)
-        command.setBytes(pGlobalOffset, atIndex: 1)
-        command.setBytes(pNbNeurons, atIndex: 2)
-        command.setBytes(pNbBatch, atIndex: 3)
-        command.setBytes(pSequence, atIndex: 4)
-        command.setBytes(pSequenceCache, atIndex: 5)
-        command.setBuffer(_cacheValueTmp.metal, atIndex: 6)
-        
-        command.dispatchThreads(
-            width: nbNeurons / coeff,
-            height: batchSize * cacheSeq
-        )
-        command.enqueue()
+        if batchSize != 1 && !slidingWindow
+        {
+            let pGlobalOffset: [UInt32] = [UInt32(globalOffset)]
+            
+            command = metalKernel.createCommand(
+                kernel, deviceID: deviceID
+            )
+            command.setBuffer(_getValueCacheInputGPU()!.metal, atIndex: 0)
+            command.setBytes(pGlobalOffset, atIndex: 1)
+            command.setBytes(pNbNeurons, atIndex: 2)
+            command.setBytes(pNbBatch, atIndex: 3)
+            command.setBytes(pSequence, atIndex: 4)
+            command.setBytes(pSequenceCache, atIndex: 5)
+            command.setBuffer(_getValueCacheOutputGPU()!.metal, atIndex: 6)
+            
+            command.dispatchThreads(
+                width: nbNeurons / coeff,
+                height: batchSize * cacheSeq
+            )
+            command.enqueue()
+        }
         
-        globalOffset += cacheSeq
+        globalOffset += cacheSeq % cacheSeqMax
+        // TODO: when using sliding window with an instruct model,
+        // it is risky to erase the header information!
+        // if cacheSeq >= cacheSeqMax
+        // {
+        //     globalOffset += 5
+        // }
         
-        pGlobalOffset = [UInt32(globalOffset)]
+        let pGlobalOffset = [UInt32(globalOffset)]
         
         command = metalKernel.createCommand(
             kernel, deviceID: deviceID
@@ -1755,7 +1776,7 @@ public class ValueCausalSeq: LayerMergeSeq
         command.setBytes(pNbBatch, atIndex: 3)
         command.setBytes(pSequence, atIndex: 4)
         command.setBytes(pSequenceValue, atIndex: 5)
-        command.setBuffer(_cacheValueTmp.metal, atIndex: 6)
+        command.setBuffer(_getValueCacheOutputGPU()!.metal, atIndex: 6)
         
         command.dispatchThreads(
             width: nbNeurons / coeff,
@@ -1764,6 +1785,67 @@ public class ValueCausalSeq: LayerMergeSeq
         command.enqueue()
     }
     
+    ///
+    /// Get value cache buffer to use as input in Metal kernel.
+    ///
+    /// - Returns: value cache to use as input.
+    ///
+    private func _getValueCacheInputGPU() -> FloatBuffer?
+    {
+        if cacheSeq != nil
+        {
+            if cacheSeq % 2 == 0
+            {
+                return _cacheValueTmp
+            }
+            else
+            {
+                return cacheValue
+            }
+        }
+        return nil
+    }
+    
+    ///
+    /// Get value cache buffer to use as output in Metal kernel.
+    ///
+    /// - Returns: value cache to use as output.
+    ///
+    private func _getValueCacheOutputGPU() -> FloatBuffer?
+    {
+        if cacheSeq != nil
+        {
+            if batchSize == 1
+            {
+                return cacheValue
+            }
+            else
+            {
+                if cacheSeq >= cacheSeqMax // sliding window
+                {
+                    // The cache value has not changed.
+                    if (cacheSeqMax - 1) % 2 == 0
+                    {
+                        return cacheValue
+                    }
+                    else
+                    {
+                        return _cacheValueTmp
+                    }
+                }
+                else if cacheSeq % 2 == 0
+                {
+                    return cacheValue
+                }
+                else
+                {
+                    return _cacheValueTmp
+                }
+            }
+        }
+        return nil
+    }
+    
     /// Apply the forward pass in the GPU execution context.
     private func _forwardGPU()
     {
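
In both `QueryCausalSeq` and `ValueCausalSeq`, the single token produced at each generation step is written at `globalOffset = cacheSeq % cacheSeqMax`, which turns the cache into a circular buffer once it is full; the commented TODO notes that for instruct models this eventually overwrites the prompt header tokens. A small sketch of the indexing (hypothetical names):

// Token t lands in slot t % capacity, so once t >= capacity each new
// token overwrites the oldest cached entry.
func cacheSlot(token: Int, capacity: Int) -> Int
{
    return token % capacity
}

// With capacity 4, tokens 0...5 land in slots 0, 1, 2, 3, 0, 1:
// tokens 4 and 5 overwrite tokens 0 and 1, the oldest entries.
let slots = (0..<6).map { cacheSlot(token: $0, capacity: 4) }
assert(slots == [0, 1, 2, 3, 0, 1])
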
diff --git a/Sources/GrAIdient/Metal/Kernel/NLPFloat.metal b/Sources/GrAIdient/Metal/Kernel/NLPFloat.metal
index decae419..d20a6a9b 100644
--- a/Sources/GrAIdient/Metal/Kernel/NLPFloat.metal
+++ b/Sources/GrAIdient/Metal/Kernel/NLPFloat.metal
@@ -714,7 +714,7 @@ kernel void valueCausalSeqGenerateFloat(
     uint depthValue = j + headValue * size;
     
     float tmp = 0.0;
-    for (uint seqK=0; seqK<=sequence; seqK++)
+    for (uint seqK=0; seqK<sequence; seqK++)
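
The kernel fix above tightens the loop bound to match the clamped `sequence` (`min(cacheSeq + 1, cacheSeqMax)`) passed from the Swift side: with the window active, an inclusive bound would read one slot past the cache. A Swift sketch of the corrected reduction (illustrative names; the real code is the Metal kernel above):

// Weighted sum over exactly `sequence` cached positions; an inclusive
// bound would index values[sequence], one past the valid window.
func weightedSum(values: [Float], scores: [Float], sequence: Int) -> Float
{
    var tmp: Float = 0.0
    for seqK in 0..<sequence
    {
        tmp += values[seqK] * scores[seqK]
    }
    return tmp
}
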
[...]

+                if i >= maxTokens - tmpSeq - 1
+                {
+                    for score in scores
+                    {
+                        XCTAssert(score != 0.0)
+                    }
+                }
+                
+                // Test that the sum of the scores equals 1.
+                scores = score2Layer.outs.download()
+                
+                var sum = 0.0
+                for (j, score) in scores.enumerated()
+                {
+                    sum += Double(score)
+                    
+                    // Not every seqK is used yet: we still have 0.0 in the
+                    // context cache.
+                    if (j + 1) == scores.count && i < maxTokens - tmpSeq - 1
+                    {
+                        XCTAssert(sum == 0.0)
+                    }
+                    // Every seqK is used: there should not be any 0.0 as
+                    // the context cache is full.
+                    else if (j + 1) == scores.count
+                    {
+                        let value = round(sum * 100) / 100.0
+                        XCTAssert(value == 1.0)
+                    }
+                    // Nominal case: at the end of each score row, check
+                    // the accumulated `sum` and reset it.
+                    else if (j + 1) % (min(nbTokens + 1, maxTokens)) == 0
+                    {
+                        if sum != 0.0
+                        {
+                            let value = round(sum * 100) / 100.0
+                            XCTAssert(value == 1.0)
+                        }
+                        sum = 0.0
+                    }
+                }
+                
+                // Get result.
+                let out = (model.layers.last as! LayerSeq).outs.download()
+                
+                // Compute prediction for each token.
+                for seq in 0..