Current version (BigInteger / BigDouble / BOperation): 6.0 / 1.1 / 1.23
`add` function)
`sub` function)
`mul` function)
`dvs` function)
`bipow` function)
`nqrt` function)
`mod` function)
`iniStr` function)
`toString` function)
`BImemcpy` function)
`equals` function)
`newBI` function)
`-D` option on most compilers).
`-D BI_STANDALONE=1`; in any other case, both Integer and Double can be used.
`-D CUDA_ENABLED=1 -D BI_STANDALONE=1`; in any other case, it can only be used from regular C/C++ code.
`-D C_MAX_LENGTH=n`. The default value is 4096, so you can work with numbers of up to 4096 digits. When
`C_MAX_LENGTH` is defined, BigIntegers can hold
`n` digits
`-D BI_SERVICE=1`. When this is set, you can call the
`getReturnCode()` function to check the execution status (a return value of 0 means OK).
`-D CVALIDATE=0 BI_STANDALONE=1`. With this option, some validations are skipped, which improves performance slightly.
`.cu` file to work:
`BigInteger.cu` file and click on Properties (alternatively, select the item and press
`Alt + Enter`
`/TC` option on the "Compile as" option
`(-rdc=true)` option on the "Generate Relocatable Device Code" option:
`<<<64, 64>>>` cluster configuration).
`C_MAX_LENGTH=4096`) is only 4104 bytes long (the exact formula is
`C_MAX_LENGTH + sizeof(char) + sizeof(int)`, plus any alignment padding), and the memory object is only 160 bytes.
Calculation - Loop Start @1392; Loop End @5698 {819200 values checked on 4306 clocks} ~ 190246.171875 calcs / sec
Calculation - Loop Start @5698; Loop End @6597 {819200 values checked on 899 clocks} ~ 911234.687500 calcs / sec --> 20.88%
Calculation - Loop Start @1436; Loop End @12294 {1638400 values checked on 10858 clocks} ~ 150893.359375 calcs / sec
Calculation - Loop Start @12294; Loop End @14441 {1638400 values checked on 2147 clocks} ~ 763111.312500 calcs / sec --> 19.77%
Calculation - Loop Start @1290; Loop End @21513 {2457600 values checked on 20223 clocks} ~ 121524.992188 calcs / sec
Calculation - Loop Start @21513; Loop End @25019 {2457600 values checked on 3506 clocks} ~ 700969.812500 calcs / sec --> 17.34%
#include "stdio.h"
#include "stdlib.h"
#include "conio.h"
#include "BigInteger.h"
#include "time.h"
#include "string.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Host-side benchmark driver: times plain int, host BigInteger and device BigInteger arithmetic. Defined below.
void t_proc();
// Device kernel: for each element index below n, applies the operation selected by c[index]
// ('+', '-', '*' or '/') to the BigInteger arrays passed as a, b, x and y. Defined below.
__global__ void kernel(void* a, void* b, void* x, void* y, int* i, char* c, int n);
/* Program entry point: runs the benchmark procedure once and exits with status 0. */
int main() {
    t_proc();

    return 0;
}
/*
 * Prints one benchmark summary line for the interval [start, stop].
 *
 * start, stop: clock() readings taken right before and right after the timed loop.
 * count:       number of individual calculations performed inside the interval.
 *
 * The rate is computed in double precision; when the interval is shorter than one
 * clock tick the rate is reported as 0 instead of dividing by zero.
 */
static void t_proc_report(clock_t start, clock_t stop, int count) {
    const double ticks = (double)(stop - start);
    double rate = 0.0;

    if (ticks > 0.0)
        rate = ((double)count / ticks) * (double)CLOCKS_PER_SEC;

    printf("Calculation - Loop Start @%i; Loop End @%i {%i values checked on %i clocks} ~ %f calcs / sec\n",
           (int)start, (int)stop, count, (int)(stop - start), rate);
}

/*
 * Benchmark driver.
 *
 * Builds k random operand pairs (values 0..9999) with a random operator each, then runs
 * and times the same workload `perf` times in three flavours:
 *   4.1 plain int arithmetic on the host,
 *   4.2 BigInteger arithmetic on the host (add / sub / mul / dvs),
 *   4.3 BigInteger arithmetic on the device through `kernel`.
 * One throughput line is printed per flavour. Set `print` to 1 to also dump operands
 * and results (this slows the run down considerably).
 */
void t_proc() {
    //dimension & control
    int k = 8192;   //number of items
    int perf = 100; //performance indicator (number of passes over the k items)
    int print = 0;  //toggle to 1 to verbose print (performance drain!)
    int y;
    int z;
    int dx = 64;    //CUDA config: number of blocks (first launch parameter)
    int dy = 64;    //CUDA config: threads per block (second launch parameter)
    //clock() readings are kept as clock_t: a float cannot represent large tick counts exactly
    clock_t str;
    clock_t end;
    cudaError_t cuErr = cudaSuccess;

    //Int Data
    int* a = NULL;
    int* b = NULL;

    //Host Data
    BigInteger* hostBIa = NULL;
    BigInteger* hostBIb = NULL;
    int* hostInt = NULL;
    char* hostOp = NULL;

    //Device Data
    BigInteger* deviceBIa = NULL;
    BigInteger* deviceBIb = NULL;
    int* deviceInt = NULL;
    char* deviceOp = NULL;
    BigInteger* deviceX = NULL;
    BigInteger* deviceY = NULL;

    //printing & aux vars
    memory* hostM = NULL;
    char* st1;
    char* st2;
    char* st3;
    int t;
    char bis[6]; //holds up to 4 digits plus the terminator (values are < 10000)

    //start the engine
    hostM = (memory*)malloc(getMemorySize());

    if (hostM == NULL) {
        fprintf(stderr, "t_proc: out of host memory\n");
        return;
    }

    init((void**)hostM);
    iniStr(&st1);
    iniStr(&st2);
    iniStr(&st3);
    srand((unsigned int)time(NULL));

    if (print == 1)
        printf("y\tva\top\tvb\tret\tres\n");

    //1. Allocate host memory (k times)
    hostBIa = (BigInteger*)malloc(sizeof(BigInteger) * k);
    hostBIb = (BigInteger*)malloc(sizeof(BigInteger) * k);
    hostInt = (int*)malloc(sizeof(int) * k);
    hostOp = (char*)malloc(sizeof(char) * k);
    a = (int*)malloc(sizeof(int) * k);
    b = (int*)malloc(sizeof(int) * k);

    if (hostBIa == NULL || hostBIb == NULL || hostInt == NULL || hostOp == NULL || a == NULL || b == NULL) {
        fprintf(stderr, "t_proc: out of host memory\n");
        //free(NULL) is a no-op, so every buffer can be released unconditionally
        free(hostBIa);
        free(hostBIb);
        free(hostInt);
        free(hostOp);
        free(a);
        free(b);
        return;
    }

    //2. Give values
    //2.1. va
    for (y = 0; y < k; y++) {
        t = rand() % 10000;
        _itoa_s(t, bis, sizeof(bis), 10);
        newBI(&hostBIa[y], bis, 0);
        a[y] = t;
    }

    //2.2. vb
    for (y = 0; y < k; y++) {
        t = rand() % 10000;
        _itoa_s(t, bis, sizeof(bis), 10);
        newBI(&hostBIb[y], bis, 0);
        b[y] = t;
    }

    //2.3. ret
    for (y = 0; y < k; y++)
        hostInt[y] = rand() % 10;

    //2.4. op
    for (y = 0; y < k; y++) {
        t = rand() % 4;

        switch (t) {
        case 0:
            hostOp[y] = '+';
            break;
        case 1:
            hostOp[y] = '-';
            break;
        case 2:
            hostOp[y] = '*';
            break;
        case 3:
            hostOp[y] = '/';
            break;
        }
    }

    //2.5. Move it to Device
    //deviceX and deviceY are seeded from hostBIb; the kernel hands them to the device
    //routines as extra arguments (presumably scratch space - confirm against the library)
    h2d((void**)&deviceBIa, hostBIa, k, sizeof(BigInteger));
    h2d((void**)&deviceBIb, hostBIb, k, sizeof(BigInteger));
    h2d((void**)&deviceX, hostBIb, k, sizeof(BigInteger));
    h2d((void**)&deviceY, hostBIb, k, sizeof(BigInteger));
    h2d((void**)&deviceInt, hostInt, k, sizeof(int));
    h2d((void**)&deviceOp, hostOp, k, sizeof(char));

    //3. Trace information
    if (print == 1) {
        for (y = 0; y < k; y++) {
            toString(&hostBIa[y], st1);
            toString(&hostBIb[y], st2);
            printf("%i\t%s\t%c\t%s\t%i\n", y, st1, hostOp[y], st2, hostInt[y]);
        }
    }

    //4. Calculation
    //4.1. Int calculation
    str = clock();

    for (z = 0; z < perf; z++) {
        for (y = 0; y < k; y++) {
            switch (hostOp[y]) {
            case '+':
                a[y] += b[y];
                break;
            case '-':
                a[y] -= b[y];
                break;
            case '*':
                //repeated multiplication exceeds the int range; multiply as unsigned so the
                //wrap-around is well defined instead of signed-overflow undefined behaviour
                a[y] = (int)((unsigned int)a[y] * (unsigned int)b[y]);
                break;
            case '/':
                //b[y] comes from rand() % 10000 and can be 0: skip instead of trapping
                if (b[y] != 0)
                    a[y] /= b[y];
                break;
            }
        }
    }

    end = clock();

    if (print == 1) {
        for (y = 0; y < k; y++) {
            printf("\t\t\t\t\t%i\n", a[y]);
        }
    }

    t_proc_report(str, end, k * perf);

    //4.2. Host Calculation
    str = clock();

    for (z = 0; z < perf; z++) {
        for (y = 0; y < k; y++) {
            switch (hostOp[y]) {
            case '+':
                add(&hostBIa[y], &hostBIb[y], hostM);
                break;
            case '-':
                sub(&hostBIa[y], &hostBIb[y], hostM);
                break;
            case '*':
                mul(&hostBIa[y], &hostBIb[y], hostM);
                break;
            case '/':
                dvs(&hostBIa[y], &hostBIb[y], hostM);
                break;
            //the operators below are never generated by step 2.4; kept for manual experiments
            case 's':
                nqrt(&hostBIa[y], hostInt[y], hostM);
                break;
            case '^':
                bipow(&hostBIa[y], hostInt[y], hostM);
                break;
            case '?':
                equals(&hostBIa[y], &hostBIb[y], &hostInt[y]);
                break;
            }
        }
    }

    end = clock();

    if (print == 1) {
        //separator
        printf("\n");

        for (y = 0; y < k; y++) {
            toString(&hostBIa[y], st1);
            printf("\t\t\t\t\t%s\n", st1);
        }
    }

    t_proc_report(str, end, k * perf);

    //4.3. Device calculation
    str = clock();

    for (z = 0; z < perf; z++) {
        kernel <<<dx, dy>>>(deviceBIa, deviceBIb, deviceX, deviceY, deviceInt, deviceOp, k);

        //launch-configuration errors are only visible through cudaGetLastError();
        //faults raised while the kernel runs surface at the synchronisation point
        cuErr = cudaGetLastError();

        if (cuErr == cudaSuccess)
            cuErr = cudaDeviceSynchronize();

        if (cuErr != cudaSuccess)
            break;
    }

    end = clock();

    if (cuErr != cudaSuccess) {
        //a failed pass would make both the timing and the copied-back data meaningless
        fprintf(stderr, "t_proc: device calculation failed on pass %i: %s\n", z, cudaGetErrorString(cuErr));
    } else {
        //move data back
        d2h(hostBIa, &deviceBIa, k, sizeof(BigInteger));

        //print result
        if (print == 1) {
            //separator
            printf("\n");

            for (y = 0; y < k; y++) {
                toString(&hostBIa[y], st1);
                printf("\t\t\t\t\t%s\n", st1);
            }
        }

        t_proc_report(str, end, k * perf);
    }

    //8. Finish
    //release the host buffers allocated in step 1
    //NOTE(review): hostM, st1..st3 and the device buffers created through h2d are not
    //released here because their matching release routines are not visible in this file
    free(hostBIa);
    free(hostBIb);
    free(hostInt);
    free(hostOp);
    free(a);
    free(b);

    return;
}
// multi multi multi multi multi multi
/*
 * Device kernel: applies one BigInteger operation per element.
 *
 * a, b, x, y: device arrays of BigInteger, passed as void*. For element p the
 *             operation receives &a[p] and &b[p]; subtraction also receives &x[p],
 *             and multiplication / division receive &x[p] and &y[p] as well.
 * i:          not read by this kernel.
 * c:          device array of operator characters ('+', '-', '*', '/'); any other
 *             character leaves the element untouched.
 * n:          number of elements in the arrays.
 *
 * Layout: 1D grid of 1D blocks. The grid-stride loop covers all n elements for any
 * launch configuration. No shared memory is used.
 */
__global__ void kernel(void* a, void* b, void* x, void* y, int* i, char* c, int n) {
    // View the untyped buffers as BigInteger arrays once, instead of casting per access.
    BigInteger* const lhs = (BigInteger*)a;
    BigInteger* const rhs = (BigInteger*)b;
    BigInteger* const auxX = (BigInteger*)x;
    BigInteger* const auxY = (BigInteger*)y;

    const int first = blockIdx.x * blockDim.x + threadIdx.x; // global thread index
    const int step = blockDim.x * gridDim.x;                 // total threads in the grid

    for (int pos = first; pos < n; pos += step) {
        const char op = c[pos];
        BigInteger* const va = lhs + pos;
        BigInteger* const vb = rhs + pos;

        if (op == '+') {
            CUpAdd(va, vb);
        } else if (op == '-') {
            CUsub(va, vb, auxX + pos);
        } else if (op == '*') {
            CUsMul(va, vb, auxX + pos, auxY + pos);
        } else if (op == '/') {
            CUsDvs(va, vb, auxX + pos, auxY + pos);
        }
    }
}