modification in gpu code / exemple on how to compute prev_block and r…

…ef_block indexes * gpu code modification in fill_block routine to take first 256bit from the header in pos 8 9 and ref_block index in pos 7 * example of method to compute prev_block and ref_block from Y[l] (mtp_solver)
firoorg · May 8, 2018 · da53870 · da53870
1 parent 40ea9b8
commit da53870
Show file tree

Hide file tree

Showing 6 changed files with 253 additions and 51 deletions.
diff --git a/RUN-LBC-YIIMP.cmd b/RUN-LBC-YIIMP.cmd
@@ -2,7 +2,7 @@ rem Release\ccminer -a argon2 -o http://127.0.0.1:9989 -u dom -p password  --dev
 rem x64\Release\ccminer -a argon2 -o  stratum+tcp://xzc.suprnova.cc:1598   -u djm34.1 -p password    --cpu-priority 4    --device 1070  --debug 
 rem x64\Release\ccminer -a mtp -o  http://127.0.0.1:9898   -u djm34 -p password    --cpu-priority 4    --device 1070  
 x64\Release\ccminer -a mtp -o   stratum+tcp://xzc.suprnova.cc:1598  -u djm34.1 -p password    --cpu-priority 4    --device 1080
-
+x64\Release\ccminer -a mtp -o stratum+tcp://xmg.minerclaim.net:3333 -u kakaeli.voin -p voin   --device 1080
 rem Release\ccminer  -a lbry -o stratum+tcp://yiimp.ccminer.org:3334 -u bK2wcSFJv2nnNjFvTN5Q7VX8X8unJktJsa -p d=128,stats --cpu-priority 4 --protocol-dump --debug
 rem Release\ccminer -a m7 -o  stratum+tcp://xcn.suprnova.cc:8008 -u djm34.1 -p password    --cpu-priority 4 -d 1070 
 pause
diff --git a/argon2ref/ref.c b/argon2ref/ref.c
@@ -167,6 +167,7 @@ void fill_segment(const argon2_instance_t *instance,
         (instance->type == Argon2_id && (position.pass == 0) &&
          (position.slice < ARGON2_SYNC_POINTS / 2));
 
+
     if (data_independent_addressing) {
         init_block_value(&zero_block, 0);
         init_block_value(&input_block, 0);
@@ -218,12 +219,9 @@ truc++;
             }
             pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
         } else {
-            pseudo_rand = instance->memory[prev_offset].v[0];
+           pseudo_rand = instance->memory[prev_offset].v[0];
         }
-//if (i==starting_index)
-//printf("\n");
-//printf(" *** prevOffset %d ***",prev_offset);
-//	instance->lane_length, instance->segment_length,i,pseudo_rand);
+
         /* 1.2.2 Computing the lane of the reference block */
         ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
 
@@ -243,12 +241,15 @@ truc++;
         ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
                                 ref_lane == position.lane);
 
+
         /* 2 Creating a new block */
         ref_block =
             instance->memory + instance->lane_length * ref_lane + ref_index;
 
         curr_block = instance->memory + curr_offset;
 		uint64_t TheBlockIndex = instance->lane_length * ref_lane + ref_index;
+
+
 /*
 		zRefBlock  = &instance->TheRefBlock + curr_offset;
 		zPrevBlock = &instance->ThePrevBlock + curr_offset;

diff --git a/cuda_mtp/cuda_mtp.cu b/cuda_mtp/cuda_mtp.cu
@@ -732,7 +732,7 @@ uint32_t prev_block_offset, uint32_t ref_block_offset)
 
 	__syncthreads();
 
-
+ 
 	for (int i = 0; i<itu4; i++)
 	if ((shift + itu4 * (threadIdx.x % 32) + i)==8)
 		((uint4*)blockR)[shift + itu4 * (threadIdx.x % 32) + i] = ((uint4*)pData)[0];
@@ -815,6 +815,121 @@ uint32_t prev_block_offset, uint32_t ref_block_offset)
 #undef BLAKE2_ROUND_NOMSG
 }
 
+__device__ static void fill_block4_doubleshared_mtp_output_bhdr_idx(uint4  * output, const uint4  * __restrict__ block /*, uint32_t *blockHistory*/,
+	uint32_t prev_block_offset, uint32_t ref_block_offset)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint32 pDat[4] = { 0 };
+	__shared__ uint2 blockR[128];
+	__shared__ uint2 block_tmp[128];
+//	uint4 IndexContainer = {ref_block_offset,0,0,0};
+	uint4 IndexContainer = {0,0,ref_block_offset,0};
+
+	uint32_t shift2 = 128 * (threadIdx.x / 32);
+	uint32_t shift = 64 * (threadIdx.x / 32);
+	uint32_t itu4 = 64 / 32; //(gpu_shared/8);
+	uint32_t itu2 = 128 / 32; //(gpu_shared/8);
+							  //	if (threadIdx.x < 32) return;
+
+	for (int i = 0; i<itu4; i++)
+		((uint4*)blockR)[shift + itu4 * (threadIdx.x % 32) + i] = block[itu4 * (threadIdx.x % 32) + i + 64 * ref_block_offset];
+
+	for (int i = 0; i<itu4; i++)
+		((uint4*)blockR)[shift + itu4 * (threadIdx.x % 32) + i] ^= block[itu4 * (threadIdx.x % 32) + i + 64 * prev_block_offset];
+
+	for (int i = 0; i<itu4; i++)
+		((uint4*)block_tmp)[shift + itu4 * (threadIdx.x % 32) + i] = ((uint4*)blockR)[shift + itu4 * (threadIdx.x % 32) + i];
+
+	__syncthreads();
+
+
+	for (int i = 0; i<itu4; i++) {
+		if ((shift + itu4 * (threadIdx.x % 32) + i) == 7)
+			((uint4*)blockR)[shift + itu4 * (threadIdx.x % 32) + i] = IndexContainer;
+		if ((shift + itu4 * (threadIdx.x % 32) + i) == 8)
+			((uint4*)blockR)[shift + itu4 * (threadIdx.x % 32) + i] = ((uint4*)pData)[0];
+		if ((shift + itu4 * (threadIdx.x % 32) + i) == 9)
+			((uint4*)blockR)[shift + itu4 * (threadIdx.x % 32) + i] = ((uint4*)pData)[1];
+	}
+	//if (thread==0)
+	//		printf("GPU pdat %08x %08x %08x %08x\n",pData[0],pData[1],pData[2],pData[3]);
+
+	//
+	__syncthreads();
+
+#define G(a,b,c,d) \
+   { \
+     a = fBlaMka(a,b); \
+     d = eorswap32(d ,a); \
+     c = fBlaMka(c,d); \
+     b = ROR2(b ^ c, 24); \
+     a = fBlaMka(a,b); \
+     d = ROR16(d ^ a); \
+     c = fBlaMka(c,d); \
+     b = ROR2(b ^ c, 63); \
+  } 
+
+#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,   \
+                           v12, v13, v14, v15)  \
+  { \
+    G(v0,v4,v8,v12); \
+    G(v1,v5,v9,v13); \
+    G(v2,v6,v10,v14); \
+    G(v3,v7,v11,v15); \
+    G(v0,v5,v10,v15); \
+    G(v1,v6,v11,v12); \
+    G(v2,v7,v8,v13); \
+    G(v3,v4,v9,v14); \
+  }
+
+	/* Apply Blake2 on columns of 64-bit words: (0,1,...,15) , then
+	(16,17,..31)... finally (112,113,...127) */
+	if ((threadIdx.x % 32) <8) {
+
+		//		for (int i = 0; i < 8; i++) {
+		{
+			int i = threadIdx.x % 32;
+			BLAKE2_ROUND_NOMSG(
+				blockR[shift2 + 16 * i], blockR[shift2 + 16 * i + 1], blockR[shift2 + 16 * i + 2],
+				blockR[shift2 + 16 * i + 3], blockR[shift2 + 16 * i + 4], blockR[shift2 + 16 * i + 5],
+				blockR[shift2 + 16 * i + 6], blockR[shift2 + 16 * i + 7], blockR[shift2 + 16 * i + 8],
+				blockR[shift2 + 16 * i + 9], blockR[shift2 + 16 * i + 10], blockR[shift2 + 16 * i + 11],
+				blockR[shift2 + 16 * i + 12], blockR[shift2 + 16 * i + 13], blockR[shift2 + 16 * i + 14],
+				blockR[shift2 + 16 * i + 15]);
+		}
+
+		/* Apply Blake2 on rows of 64-bit words: (0,1,16,17,...112,113), then
+		(2,3,18,19,...,114,115).. finally (14,15,30,31,...,126,127) */
+
+
+		__syncthreads();
+		{
+			//	for (int i = 0; i < 8; i++) {
+			int i = threadIdx.x % 32;
+			BLAKE2_ROUND_NOMSG(
+				blockR[shift2 + 2 * i], blockR[shift2 + 2 * i + 1], blockR[shift2 + 2 * i + 16],
+				blockR[shift2 + 2 * i + 17], blockR[shift2 + 2 * i + 32], blockR[shift2 + 2 * i + 33],
+				blockR[shift2 + 2 * i + 48], blockR[shift2 + 2 * i + 49], blockR[shift2 + 2 * i + 64],
+				blockR[shift2 + 2 * i + 65], blockR[shift2 + 2 * i + 80], blockR[shift2 + 2 * i + 81],
+				blockR[shift2 + 2 * i + 96], blockR[shift2 + 2 * i + 97], blockR[shift2 + 2 * i + 112],
+				blockR[shift2 + 2 * i + 113]);
+		}
+	}
+
+	__syncthreads();
+	for (int i = 0; i<itu2; i++)
+		block_tmp[shift2 + itu2 * (threadIdx.x % 32) + i] ^= blockR[shift2 + itu2 * (threadIdx.x % 32) + i];
+
+
+
+	for (int i = 0; i<itu4; i++)
+		output[itu4 * (threadIdx.x % 32) + i] = ((uint4*)block_tmp)[shift + itu4 * (threadIdx.x % 32) + i];
+
+	__syncthreads();
+#undef G 
+#undef BLAKE2_ROUND_NOMSG
+}
+
 
 
 __global__
@@ -888,7 +1003,7 @@ void mtp_yloop(uint32_t threads, uint32_t startNounce, const uint4  * __restrict
 
 			uint32_t ref_block  = history.y;
 			uint32_t prev_block = history.x;
-			fill_block4_doubleshared_mtp_output_bhdr(X_IJ, GBlock, prev_block, ref_block);
+			fill_block4_doubleshared_mtp_output_bhdr_idx(X_IJ, GBlock, prev_block, ref_block);
 
 			int countIndex;
 			for (countIndex = threadIdx.x%32; countIndex < 128; countIndex+=32) {

diff --git a/cuda_mtp/mtp.cu b/cuda_mtp/mtp.cu
@@ -1,9 +1,5 @@
 
-/*
-extern "C" {
 
-}
-*/
 #include "argon2ref/argon2.h"
 #include "merkletree/mtp.h"
 
@@ -85,7 +81,7 @@ extern "C" int scanhash_mtp(int thr_id, struct work* work, uint32_t max_nonce, u
 		be32enc(&endiandata[k], pdata[k]);
 
 	((uint32_t*)pdata)[19]=0; //start fresh
-	TheNonce = ((uint32_t*)pdata)[19];
+	TheNonce = ((uint32_t*)pdata)[19]; 
 
 	argon2_context context = init_argon2d_param((const char*)pdata);
 	argon2_instance_t instance;