PlummersSoftwareLLC · rbergen · Aug 14, 2023 · Aug 3, 2023 · Aug 7, 2023 · Aug 14, 2023
diff --git a/BENCHMARK.md b/BENCHMARK.md
@@ -1,10 +1,11 @@
 # Benchmarks <!-- omit in toc -->
 
-This document describes how to build and run all benchmarks (solutions) for different operating systems that can be automatically executed. 
+This document describes how to build and run all benchmarks (solutions) for different operating systems that can be automatically executed.
 
 Some solutions are not included in the automated benchmark runs, either because no Dockerfile is included in the solution, or due to legal reasons. We do strive to include all solutions in the automated benchmark runs, if it is technically and legally possible to do so.
 
 ## Table of contents <!-- omit in toc -->
+
 - [What operating system to use?](#what-operating-system-to-use)
 - [General working mechanism](#general-working-mechanism)
 - [Linux](#linux)
@@ -349,8 +350,9 @@ make DIRECTORY=PrimeCPP
 ## Running in unconfined mode
 
 For some interpreted languages (Python, Ruby, NodeJS), docker has a non-zero effect slowing CPU-intensive code.
-See https://github.com/moby/moby/issues/41389 for the related docker issue. You can disable some of the sandboxing
+See <https://github.com/moby/moby/issues/41389> for the related docker issue. You can disable some of the sandboxing
 to obtain near-native performance (at least on Linux) with the `UNCONFINED=1` option:
+
 ```bash
 make UNCONFINED=1
 make DIRECTORY=PrimeMyFavoriteInterpretedLanguage UNCONFINED=1
@@ -361,15 +363,15 @@ make DIRECTORY=PrimeMyFavoriteInterpretedLanguage UNCONFINED=1
 The benchmark suite supports multiple output formats; if no formatter is specified, it will default to the `table` format.
 Here are the supported values:
 
-* table
-* chart
-* csv
-* json (this also includes the machine information data)
-* minifiedjson (same as json, but not pretty-printed)
+- table
+- chart
+- csv
+- json (this also includes the machine information data)
+- minifiedjson (same as json, but not pretty-printed)
 
 The output format can be controlled via the `FORMATTER` variable like this:
 
-```
+```shell
 make FORMATTER=json
 make DIRECTORY=PrimeCrystal/solution_1 FORMATTER=csv
 ```
diff --git a/PrimeCUDA/solution_2/primes.cu b/PrimeCUDA/solution_2/primes.cu
@@ -25,27 +25,27 @@ __global__ void initialize_buffer(uint64_t blockSize, uint64_t wordCount, sieve_
 __global__ void unmark_multiples_threads(uint32_t primeCount, uint32_t *primes, uint64_t halfSize, uint32_t sizeSqrt, sieve_t *sieve)
 {
     // We unmark every "MAX_THREADS"th prime's multiples, starting with our thread index
-    for (uint32_t primeIndex = threadIdx.x; primeIndex < primeCount; primeIndex += MAX_THREADS) 
+    for (uint32_t primeIndex = threadIdx.x; primeIndex < primeCount; primeIndex += MAX_THREADS)
     {
         const uint32_t prime = primes[primeIndex];
         const uint64_t primeSquared = uint64_t(prime) * prime;
 
-        // Unmark multiples starting at just beyond the square root of the sieve size or the square of the prime, 
+        // Unmark multiples starting at just beyond the square root of the sieve size or the square of the prime,
         //   whichever is larger.
         uint64_t firstUnmarked = primeSquared > sizeSqrt ? primeSquared : ((sizeSqrt / prime + 1) * prime);
         // We're marking off odd multiples only, so make sure we start with one of those!
         if (!(firstUnmarked & 1))
             firstUnmarked += prime;
 
-        for (uint64_t index = firstUnmarked >> 1; index <= halfSize; index += prime) 
-            // Clear the bit in the word that corresponds to the last part of the index 
+        for (uint64_t index = firstUnmarked >> 1; index <= halfSize; index += prime)
+            // Clear the bit in the word that corresponds to the last part of the index
             atomicAnd(&sieve[WORD_INDEX(index)], ~(sieve_t(1) << BIT_INDEX(index)));
     }
 }
 
 __global__ void unmark_multiples_blocks(uint32_t primeCount, uint32_t *primes, uint64_t halfSize, uint32_t sizeSqrt, uint32_t maxBlockIndex, uint64_t blockSize, sieve_t *sieve)
 {
-    // Calculate the start and end of the block we need to work on, at buffer word boundaries. 
+    // Calculate the start and end of the block we need to work on, at buffer word boundaries.
     //   Note that the first variable is a number in sieve space...
     uint64_t blockStart = uint64_t(blockIdx.x) * blockSize + sizeSqrt;
     //   ...and the second is an index in the sieve buffer (representing odd numbers only)
@@ -64,7 +64,7 @@ __global__ void unmark_multiples_blocks(uint32_t primeCount, uint32_t *primes, u
         const uint32_t prime = primes[primeIndex];
         const uint64_t primeSquared = uint64_t(prime) * prime;
 
-        // Unmark multiples starting at just beyond the start of our block or the square of the prime, 
+        // Unmark multiples starting at just beyond the start of our block or the square of the prime,
         //   whichever is larger.
         uint64_t firstUnmarked = primeSquared >= blockStart ? primeSquared : ((blockStart / prime + 1) * prime);
         // We're marking off odd multiples only, so make sure we start with one of those!
@@ -79,18 +79,18 @@ __global__ void unmark_multiples_blocks(uint32_t primeCount, uint32_t *primes, u
                 continue;
 
             uint64_t wordIndex = WORD_INDEX(index);
-            uint32_t bitIndex = BIT_INDEX(index);                
+            uint32_t bitIndex = BIT_INDEX(index);
             sieve_t bitMask = 0;
 
             do
             {
                 // Check if our bit index has moved past the current word's bits. If so...
-                if (bitIndex > MAX_BIT_INDEX) 
+                if (bitIndex > MAX_BIT_INDEX)
                 {
                     // ...clear the current word's bits that are set in the mask, and move on the next word.
                     sieve[wordIndex++] &= ~bitMask;
                     // "Shift bitmask one word to the right" through calculation. It has to be done that way
-                    //   in part because our word length may be the maximum the GPU supports (64 bits). 
+                    //   in part because our word length may be the maximum the GPU supports (64 bits).
                     bitIndex %= BITS_PER_WORD;
                     bitMask = sieve_t(1) << bitIndex;
                 }
@@ -111,8 +111,8 @@ __global__ void unmark_multiples_blocks(uint32_t primeCount, uint32_t *primes, u
         {
         #endif // ROLLING_LIMIT > 0
 
-            for (uint64_t index = firstUnmarked >> 1; index <= lastIndex; index += prime) 
-                // Clear the bit in the word that corresponds to the last part of the index 
+            for (uint64_t index = firstUnmarked >> 1; index <= lastIndex; index += prime)
+                // Clear the bit in the word that corresponds to the last part of the index
                 sieve[WORD_INDEX(index)] &= ~(sieve_t(1) << BIT_INDEX(index));
 
         #if ROLLING_LIMIT > 0
@@ -121,7 +121,7 @@ __global__ void unmark_multiples_blocks(uint32_t primeCount, uint32_t *primes, u
     }
 }
 
-class Sieve 
+class Sieve
 {
     const uint64_t sieve_size;
     const uint64_t half_size;
@@ -131,7 +131,7 @@ class Sieve
     sieve_t *device_sieve_buffer;
     sieve_t *host_sieve_buffer;
 
-    void unmark_multiples(Parallelization type, uint32_t primeCount, uint32_t *primeList) 
+    void unmark_multiples(Parallelization type, uint32_t primeCount, uint32_t *primeList)
     {
         // Copy the first (square root of sieve size) buffer bytes to the device
         cudaMemcpy(device_sieve_buffer, host_sieve_buffer, (size_sqrt >> 4) + 1, cudaMemcpyHostToDevice);
@@ -165,10 +165,10 @@ class Sieve
                 // ...and increase that if the division left a remainder.
                 if (sieveSpace & SIEVE_BITS_MASK)
                     wordCount++;
-                
+
                 // The number of blocks is the maximum thread count or the number of words, whichever is lower
                 const uint32_t blockCount = (uint32_t)min(uint64_t(MAX_THREADS), wordCount);
-                
+
                 uint64_t blockSize = sieveSpace / blockCount;
                 // Increase block size if the calculating division left a remainder
                 if (sieveSpace % blockCount)
@@ -187,13 +187,13 @@ class Sieve
                 fprintf(stderr, "WARNING: Parallelization type %d unknown, multiple unmarking skipped!\n\n", to_underlying(type));
             break;
         }
-        
+
         // Release the device prime list buffer
         cudaFree(devicePrimeList);
 
         // Copy the sieve buffer from the device to the host. This function implies a wait for all GPU threads to finish.
         cudaMemcpy(host_sieve_buffer, device_sieve_buffer, buffer_byte_size, cudaMemcpyDeviceToHost);
-        
+
         #ifdef DEBUG
         printf("- device to host copy of sieve buffer complete.\n");
         #endif
@@ -217,7 +217,7 @@ class Sieve
 
         // The number of blocks is the maximum number of threads or the number of words in the buffer, whichever is lower
         const uint32_t blockCount = (uint32_t)min(uint64_t(MAX_THREADS), buffer_word_size);
-        
+
         uint64_t blockSize = buffer_word_size / blockCount;
         // Increase block size if the calculating division left a remainder
         if (buffer_word_size % blockCount)
@@ -229,7 +229,7 @@ class Sieve
 
         initialize_buffer<<<blockCount, 1>>>(blockSize, buffer_word_size, device_sieve_buffer);
 
-        // Allocate host sieve buffer (odd numbers only) and initialize the bytes up to the square root of the sieve 
+        // Allocate host sieve buffer (odd numbers only) and initialize the bytes up to the square root of the sieve
         //   size to all 1s.
         host_sieve_buffer = (sieve_t *)malloc(buffer_byte_size);
         memset(host_sieve_buffer, 255, (size_sqrt >> 4) + 1);
@@ -242,7 +242,7 @@ class Sieve
         #endif
     }
 
-    ~Sieve() 
+    ~Sieve()
     {
         cudaFree(device_sieve_buffer);
         free(host_sieve_buffer);
@@ -267,7 +267,7 @@ class Sieve
         {
             uint64_t index = factor >> 1;
 
-            if (host_sieve_buffer[WORD_INDEX(index)] & (sieve_t(1) << BIT_INDEX(index))) 
+            if (host_sieve_buffer[WORD_INDEX(index)] & (sieve_t(1) << BIT_INDEX(index)))
             {
                 primeList[primeCount++] = factor;
 
@@ -283,36 +283,40 @@ class Sieve
         return host_sieve_buffer;
     }
 
-    uint64_t count_primes() 
+    uint64_t count_primes()
     {
         uint64_t primeCount = 0;
         const uint64_t lastWord = WORD_INDEX(half_size);
         sieve_t word;
 
         // For all buffer words except the last one, just count the set bits in the word until there are none left.
         //   We only hold bits for odd numbers in the sieve buffer. However, due to a small "mathematical coincidence"
-        //   bit 0 of word 0 effectively represents the only even prime 2. This means the "count set bits" approach 
+        //   bit 0 of word 0 effectively represents the only even prime 2. This means the "count set bits" approach
         //   in itself yields the correct result.
         for (uint64_t index = 0; index < lastWord; index++)
         {
             word = host_sieve_buffer[index];
-            while (word) 
-            {
-                if (word & 1)
-                    primeCount++;
 
-                word >>= 1;
+            // This is Brian Kernighan's algorithm for counting set bits. It uses the fact that subtracting 1 from an
+            //   integer flips all bits right of the right-most set bit, and that set bit itself. So, doing a bitwise
+            //   AND of an integer with "itself minus one" clears the right-most set bit. This reduces the number of
+            //   loop iterations to the number of set bits. There is a one-statement implementation for this using a
+            //   for loop, but we'll be kind to our readers. :)
+            while (word)
+            {
+                word &= word - 1;
+                primeCount++;
             }
         }
 
         // For the last word, only count bits up to the (halved) sieve limit
         word = host_sieve_buffer[lastWord];
         const uint32_t lastBit = BIT_INDEX(half_size);
-        for (uint32_t index = 0; word && index <= lastBit; index++) 
+        for (uint32_t index = 0; word && index <= lastBit; index++)
         {
             if (word & 1)
                 primeCount++;
-            
+
             word >>= 1;
         }
 
@@ -334,7 +338,7 @@ const std::map<uint64_t, const int> resultsDictionary =
     { 10'000'000'000UL, 455052511 },
 };
 
-const std::map<Parallelization, const char *> parallelizationDictionary = 
+const std::map<Parallelization, const char *> parallelizationDictionary =
 {
     { Parallelization::threads, "threads" },
     { Parallelization::blocks,  "blocks"  }
@@ -348,12 +352,12 @@ uint64_t determineSieveSize(int argc, char *argv[])
 
     const uint64_t sieveSize = strtoul(argv[1], nullptr, 0);
 
-    if (sieveSize == 0) 
+    if (sieveSize == 0)
         return DEFAULT_SIEVE_SIZE;
 
     if (resultsDictionary.find(sieveSize) == resultsDictionary.end())
         fprintf(stderr, "WARNING: Results cannot be validated for selected sieve size of %zu!\n\n", sieveSize);
-    
+
     return sieveSize;
 }
 
@@ -364,7 +368,7 @@ void printResults(Parallelization type, uint64_t sieveSize, uint64_t primeCount,
     const auto parallelizationEntry = parallelizationDictionary.find(type);
     const char *parallelizationLabel = parallelizationEntry != parallelizationDictionary.end() ? parallelizationEntry->second : "unknown";
 
-    fprintf(stderr, "Passes: %zu, Time: %lf, Avg: %lf, Word size: %d, Max GPU threads: %d, Type: %s, Limit: %zu, Count: %zu, Validated: %d\n", 
+    fprintf(stderr, "Passes: %zu, Time: %lf, Avg: %lf, Word size: %d, Max GPU threads: %d, Type: %s, Limit: %zu, Count: %zu, Validated: %d\n",
             passes,
             duration,
             duration / passes,
@@ -411,15 +415,15 @@ int main(int argc, char *argv[])
         }
         while (duration_cast<seconds>(runTime).count() < 5);
         #endif
-        
+
         #ifdef DEBUG
         printf("\n");
         #endif
 
         const size_t primeCount = sieve->count_primes();
-        
+
         delete sieve;
 
-        printResults(type, sieveSize, primeCount, duration_cast<microseconds>(runTime).count() / 1000000.0, passes); 
+        printResults(type, sieveSize, primeCount, duration_cast<microseconds>(runTime).count() / 1000000.0, passes);
     }
 }
diff --git a/PrimeCUDA/solution_2/primes.h b/PrimeCUDA/solution_2/primes.h
@@ -1,6 +1,6 @@
 #define DEFAULT_SIEVE_SIZE 1'000'000
 
-// Maximum number of parallel threads used on the GPU. The actual number of threads used can be lower, 
+// Maximum number of parallel threads used on the GPU. The actual number of threads used can be lower,
 //   if there aren't enough "chunks of work" to keep this number of threads busy.
 #define MAX_THREADS 256
 
@@ -9,7 +9,7 @@
 
 // Highest prime number for which we'll use a rolling bit mask. This value cannot be higher than the
 //   number of bits per word. Set to 0 to disable the use of a rolling bit mask. The relevant code won't
-//   be compiled then, thus reducing performance side effects to zero. 
+//   be compiled then, thus reducing performance side effects to zero.
 #define ROLLING_LIMIT 29
 
 // If defined, the code will show debug output and run both unmarking methods once.
@@ -28,11 +28,11 @@
 #endif
 
 #if BITS_PER_WORD == 32
-    typedef unsigned int sieve_t;
+    typedef uint32_t sieve_t;
     #define MAX_WORD_VALUE UINT32_MAX
     #define WORD_SHIFT 5
 #elif BITS_PER_WORD == 64
-    typedef unsigned long long sieve_t;
+    typedef uint64_t sieve_t;
     #define MAX_WORD_VALUE UINT64_MAX
     #define WORD_SHIFT 6
 #else
@@ -55,7 +55,7 @@ enum class Parallelization : char
 
 // We have to define this ourselves, as we're not doing C++23 (yet)
 template<class TEnum>
-constexpr auto to_underlying(TEnum enumValue)  
+constexpr auto to_underlying(TEnum enumValue)
 {
    return static_cast<typename std::underlying_type<TEnum>::type>(enumValue);
 }