-
Notifications
You must be signed in to change notification settings - Fork 1
/
sp.cu
444 lines (384 loc) · 12 KB
/
sp.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
#include <iostream>
#include <cmath>
#include <fstream>
#include <string>
#include <sstream>
#include <ctime>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#define max_neigh 4 //maximum number of neighbors for each node
#define eps 1e-6 //epsilon for tolerance
#define threads_pb 512 //threads per block
#define max_itrs 500 //maximum number of iterations
using namespace std;
class node;
class svf;
/*
* Single Variable Function:
* This is a function of one variable and used throughout the code to
* indicate marginals and messages between nodes which are both single
* variable. In this case the variables have two states.
*
* __host__ __device__:
* Every function with this prefix can be called both from the cpu
* (host) and gpu (device).
*
*/
class svf
{
public:
double x[2]; //two states
__host__ __device__ svf() {
}
__host__ __device__ svf(double b0, double b1){
x[0]=b0;
x[1]=b1;
}
__host__ __device__ svf(const svf &obj){ //this is a copy constructor needed when copying the objects (svf) into the device
for(int i=0; i<2; i++){
x[i]=obj.x[i];
}
}
__host__ __device__ void normalize() { //this is functions which normalizes (sum equal to one) an svf
double t = 0;
for(int i=0; i<2; i++){
t += x[i];
}
for(int i=0; i<2; i++){
x[i] /= t;
}
}
void print() {
cout << '(' << x[0] << ',' << x[1] << ") ";
}
};
__host__ __device__ svf mult(svf a, svf b) { //this funtions multiplies two svfs by element
svf s;
for(int i=0; i<2; i++) {
s.x[i] = a.x[i] * b.x[i];
}
return s;
}
__host__ __device__ svf add(svf a, svf b) { //this functions adds two svfs by element
svf s;
for(int i=0; i<2; i++) {
s.x[i] = a.x[i] + b.x[i];
}
return s;
}
__host__ __device__ double compare_marginal(svf a, svf b){ //this function returns distance between two svfs
a.normalize();
b.normalize();
return (double)abs(a.x[0] - b.x[0]) + abs(a.x[1] - b.x[1]);
}
/*
* Node:
* This class represents each node of the factor graph, variables and
* factors are indicated by a boolean.
*/
class node
{
public:
bool is_var; //this bolean idicates if the node is a variable (or a factor)
svf prv_marginal; //this is the saved previous marginal of it
int my_ind; //this is its index
int num_neighbors; //this is its number of neighbors
int neighbors[max_neigh]; //this array contains its neighbors' indexes
svf msgs[max_neigh]; //this array contains last recieved messages from its neighbors
__host__ __device__ node() {
num_neighbors = 0;
prv_marginal = svf(double(1),double(1)); //initial marginal
}
__host__ __device__ node(const node &obj) { //this is a copy constructor needed when copying the objects (nodes) into the device
is_var = obj.is_var;
prv_marginal = obj.prv_marginal;
my_ind = obj.my_ind;
num_neighbors = obj.num_neighbors;
for(int i=0; i<max_neigh; i++){
neighbors[i] = obj.neighbors[i];
msgs[i] = obj.msgs[i];
}
}
__host__ __device__ svf marginal() { //this function calculates the node's marignal
svf tmp = svf(1,1); //unit function
for(int i=0; i<num_neighbors; i++) {
tmp = mult(tmp, msgs[i]);
}
return tmp;
}
__host__ __device__ bool check_marginal(){ //this functions checks if the marginal has converged
double diff = compare_marginal(marginal(), prv_marginal);
prv_marginal = marginal();
return (diff < eps);
}
__host__ __device__ void send_all(node *rec) { //this function calls the send function for every neighbor
for(int i=0; i<num_neighbors; i++) {
send(i, rec);
}
}
__host__ __device__ void send_init(node *f) { //this function (called by variables) sends initial beliefs (unit) to all neighbors
for(int i=0; i<num_neighbors; i++) {
f[neighbors[i]].get(my_ind, svf(double(1),double(1))); //initial marginal
}
}
__host__ __device__ void send(int k, node *rec) { //this functions sends a new message to the k'th neighbor
if(is_var) { //if this node (sender) is a variable
svf tmp = svf(1,1); //unit funtion
if(num_neighbors==1){ //if the sender has only one neighbor (variable leaf in a tree) send initial marginal
tmp = svf(double(1),double(1)); //initial marginal
}
else{ //otherwise send multiply of all incoming messages except the reciever
for (int i=0; i<num_neighbors; i++) {
if(i != k) {
tmp = mult(tmp, msgs[i]);
}
}
}
rec[neighbors[k]].get(my_ind, tmp); //call the get function of the k'th neighbor
} else { //if this node is a factor
svf tmp = svf(0,0); //zero function
for (int t=0; t<(1<<(num_neighbors-1)); t++) { //for all possible combinations of the variables in the factor except the reciever
bool vars[max_neigh];
int c=0;
for(int i=0; i<num_neighbors; i++) { //first calculate the i'th digit of the combinations, where i is the index of the neighbor
if(i!=k) {
vars[i] = (bool) (t & (1<<c));
c++;
} else {
vars[i] = 0;
}
}
bool nd=1;
double b=1;
for(int i=0; i<num_neighbors; i++) { //then calculate the message result with that combination
if(i!=k) {
b = b * msgs[i].x[vars[i]]; //multiply the messages
nd = nd & vars[i]; //AND function
}
}
svf s(b, b * !nd); //AND to NAND
tmp = add(tmp, s); //sum the results to send
}
rec[neighbors[k]].get(my_ind, tmp); //calling the get function on of the k'th neighbor
}
}
__host__ __device__ void get(int n, svf m) { //this function recieves the message m from node n and saves it (normalized)
int ind;
for (int i=0; i<num_neighbors; i++) {
if(neighbors[i] == n) {
ind = i;
break;
}
}
m.normalize(); //normalize each at stage
msgs[ind] = m;
}
};
/*
* GPU Kernel:
* This function runs the sum-product algorithm in the gpu. __shared__
* variables are shared in a block of threads. All of the calculations
* are in a single block. After each operations (of the threads:k) there
* is a __syncthreads() function which makes sure untill all threads in
* a block have finished their work. The algorithm is explained in the
* cpu section.
*/
__global__ void gpu_send_all(node *factors, node *variables, int num_f, int num_v){
__shared__ int iterations;
__shared__ int t[threads_pb];
__shared__ int temp;
unsigned int k = threadIdx.x;
iterations = 0;
if(k < num_v){
variables[k].send_init(factors);
}
__syncthreads();
t[k] = 0;
__syncthreads();
while(iterations < max_itrs){
temp = 1;
if(k<num_f){
factors[k].send_all(variables);
}
__syncthreads();
if(k<num_v){
variables[k].send_all(factors);
}
__syncthreads();
if(k<num_v){
t[k] = variables[k].check_marginal();
}
__syncthreads();
if(k<num_v){
atomicAnd(&temp, t[k]); //atomicAnd performs the function 'and' on all the blocks on a thread andreduces them to one variable using atomic functions
}
__syncthreads();
if(temp==1) break;
iterations++;
}
}
bool bit(long long n, int i) //this functions returns the i'th bit of the word n
{
return (bool) (n & (1<<i));
}
/*
* Brute Force:
* This function calculates the marignals of the variables in a factor
* graph using brute force method which is checking all the possible
* combinations of the variables in the graph and calculating the
* possibility of each state of each variable regarding the functions.
*/
void bfmarginal(node *var, node *fac, int varn, int facn, svf *bfmarginals){
long long valid_states = 0;
long long *num_ones;
num_ones = new long long[varn];
for(int i=0; i<varn; i++){
num_ones[i] = 0;
}
for(long long j=0; j<(1<<varn); j++){ //for all possible combination of the variables
bool g_state=1;
for(int k=0; k<facn; k++){ //first calculate the results of the factors on that combination
bool f_state=1;
for(int i=0; i<fac[k].num_neighbors; i++){
f_state&=bit(j,fac[k].neighbors[i]);
}
g_state&=(~f_state);
}
if(g_state==1){ //then check if the result is valid and count the ones and zeros
valid_states++;
for(int i=0; i<varn; i++){
num_ones[i]+=bit(j,i);
}
}
}
for(int j=0; j<varn; j++){ //finally devide the number of ones/zeros by all valid states
double x1 = (double)num_ones[j]/valid_states;
double x0 = (double)(valid_states-num_ones[j])/valid_states;
bfmarginals[j] = svf(x0,x1);
}
delete[] num_ones;
}
int main()
{
int vn = 0;
int fn = 0;
string line;
ifstream infile("lattice.txt"); //get factor graph from file
getline(infile, line);
istringstream iss(line);
iss >> vn >> fn; //first get the number of variables and factors
node *v;
node *f;
v = new node[vn];
f = new node[fn];
for(int i=0; i<vn; i++){ //initilize my_ind and is_var
v[i].my_ind = i;
v[i].is_var = 1;
}
for(int i=0; i<fn; i++){
f[i].my_ind = i;
f[i].is_var = 0;
}
for(int i=0; i<fn; i++){ //then read the number of neighbors of each factor and their indexes
getline(infile, line);
istringstream iss(line);
int k = 0;
iss >> k; //number of neighbors of factor i
for (int j=0; j<k; j++){
int t = 0;
iss >> t; //index of neighbors of factor i
f[i].neighbors[j] = t;
v[t].neighbors[v[t].num_neighbors] = i; //do the same for the variables
v[t].num_neighbors++; //increase number of variables' neighors
}
f[i].num_neighbors = k;
}
infile.close();
//copying the raw (variable & factor) nodes to use in cpu section
node *vc;
node *fc;
vc = new node[vn];
fc = new node[fn];
for(int i=0; i<vn; i++){
vc[i] = v[i];
}
for(int i=0; i<fn; i++){
fc[i] = f[i];
}
/*GPU Section*/
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0); //start gpu timer
node *fg;
node *vg;
const size_t szf = size_t(fn) * sizeof(node); //size of factor and variable arrays
const size_t szv = size_t(vn) * sizeof(node);
cudaMalloc((void**)&fg, szf); //allocate space for this arrays on the device
cudaMalloc((void**)&vg, szv);
cudaMemcpy(fg, f, szf, cudaMemcpyHostToDevice); //copy the arrays into the device
cudaMemcpy(vg, v, szv, cudaMemcpyHostToDevice);
gpu_send_all <<<1, threads_pb>>>(fg, vg, fn, vn); //call the gpu kernel with one block and 512 threads per block
cudaThreadSynchronize(); //wait for all the threads to finish their work
cudaMemcpy(f, fg, szf, cudaMemcpyDeviceToHost); //copy back the information
cudaMemcpy(v, vg, szv, cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0); //stop gpu timer
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaFree(vg); //free device memories
cudaFree(fg);
cudaEventDestroy(start);
cudaEventDestroy(stop);
/*CPU Section*/
clock_t startCPU, stopCPU;
startCPU = clock(); //start cpu timer
for(int i=0; i<vn; i++){
vc[i].send_init(fc); //send initial beliefs (marginals) from variables to factors
}
int iterations = 0; //iterator
while(iterations < max_itrs){ //while less than maximum allowed iterations
bool t = 1;
for(int i=0; i<fn; i++){ //send messages from all factors to all variables
fc[i].send_all(vc);
}
for(int i=0; i<vn; i++){ //send messages from all variables to all factors
vc[i].send_all(fc);
}
for(int i=0; i<vn; i++){
t &= vc[i].check_marginal(); //check the mariginals for the variables for convergency
}
if(t==1) break;
iterations++;
}
stopCPU = clock(); //stop cpu timer
/*Brute Force Section*/
clock_t startBF, stopBF;
startBF = clock(); //start brute force timer
svf *bfmarginals;
bfmarginals = new svf[vn];
bfmarginal(v, f, vn, fn, bfmarginals); //call brute force method
stopBF = clock(); //stop brute force timer
//print the results
cout << "Using " << iterations << " Iterations and " << eps << " Tolerance:" << endl;
cout << "v[x]:(GPU) (CPU) (BrF)" << endl;
cout << "Time:(" << elapsedTime << " ms) (" << (stopCPU-startCPU) << " ms) (" << (stopBF-startBF) << " ms)" << endl;
for(int i=0; i<vn; i++) {
svf s = v[i].marginal();
s.normalize();
cout << "v[" << i << "]:";
s.print();
s = vc[i].marginal();
s.normalize();
s.print();
bfmarginals[i].print();
cout << endl;
}
delete[] vc; //free memories allocated
delete[] fc;
delete[] v;
delete[] f;
delete[] bfmarginals;
return 0;
}