Current version (BigInteger / BigDouble / BOperation): 6.0 / 1.1 / 1.23
`add` function)
`sub` function)
`mul` function)
`dvs` function)
`bipow` function)
`nqrt` function)
`mod` function)
`iniStr` function)
`toString` function)
`BImemcpy` function)
`equals` function)
`newBI` function)
`-D` option on most compilers).
`-D BI_STANDALONE=1`; in any other case, it allows the use of both Integer and Double.
`-D CUDA_ENABLED=1 -D BI_STANDALONE=1`; in any other case, it can only be used from regular C/C++ code.
`-D C_MAX_LENGTH=n`. The default value is 4096, so you can work with numbers of up to 4096 digits. When
`C_MAX_LENGTH` is defined, BigIntegers can hold
`n` digits
`-D BI_SERVICE=1`. When this is set, you can call the
`getReturnCode()` function to learn the execution status (a return value of 0 means OK).
`-D CVALIDATE=0 -D BI_STANDALONE=1`. With this option, some validations are skipped, and performance improves a bit.
`.cu` file to work:
`BigInteger.cu` file and click on Properties (alternatively, select the item and press
Alt + Enter
`/TC` option on the "Compile as" option
(`-rdc=true`) option on the "Generate Relocatable Device Code" option:
`<<<64, 64>>>` cluster configuration).
`C_MAX_LENGTH=4096`) is only 4104 bytes long (the exact formula is
`C_MAX_LENGTH + sizeof(char) + sizeof(int)`, i.e. 4101 bytes, presumably rounded up to 4104 by struct alignment padding); and the memory object is only 160 bytes.
Calculation - Loop Start @1392; Loop End @5698 {819200 values checked on 4306 clocks} ~ 190246.171875 calcs / sec Calculation - Loop Start @5698; Loop End @6597 {819200 values checked on 899 clocks} ~ 911234.687500 calcs / sec --> 20.88% Calculation - Loop Start @1436; Loop End @12294 {1638400 values checked on 10858 clocks} ~ 150893.359375 calcs / sec Calculation - Loop Start @12294; Loop End @14441 {1638400 values checked on 2147 clocks} ~ 763111.312500 calcs / sec --> 19.77% Calculation - Loop Start @1290; Loop End @21513 {2457600 values checked on 20223 clocks} ~ 121524.992188 calcs / sec Calculation - Loop Start @21513; Loop End @25019 {2457600 values checked on 3506 clocks} ~ 700969.812500 calcs / sec --> 17.34%
#include "stdio.h"
#include "stdlib.h"
#include "conio.h"
#include "BigInteger.h"
#include "time.h"
#include "string.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

void t_proc();
__global__ void kernel(void* a, void* b, void* x, void* y, int* i, char* c, int n);

/*
 * Aborts the benchmark with a readable message when a CUDA runtime call fails.
 *   status: value returned by the CUDA runtime call
 *   what:   short description of the step, used in the error message
 */
static void checkCuda(cudaError_t status, const char* what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/*
 * malloc wrapper that aborts instead of returning NULL, so the fill loops
 * below never dereference a failed allocation.
 */
static void* checkedAlloc(size_t bytes)
{
	void* p = malloc(bytes);

	if (p == NULL)
	{
		fprintf(stderr, "Out of host memory (%lu bytes requested)\n", (unsigned long)bytes);
		exit(EXIT_FAILURE);
	}

	return p;
}

/*
 * Fills `count` BigIntegers and their plain-int mirrors with the same random
 * values in [0, 9999].
 */
static void fillRandomValues(BigInteger* bi, int* mirror, int count)
{
	char bis[6]; //up to 4 digits + terminator
	int y;
	int t;

	for (y = 0; y < count; y++)
	{
		t = rand() % 10000;
		_itoa_s(t, bis, 6, 10);
		newBI(&bi[y], bis, 0);
		mirror[y] = t;
	}
}

/*
 * Prints the timing line for one benchmark phase.
 *   start / stop: clock() samples taken around the phase
 *   checked:      number of operations executed in the phase
 * The throughput is reported as 0 when the phase is shorter than the clock
 * resolution, instead of dividing by zero.
 */
static void reportThroughput(clock_t start, clock_t stop, int checked)
{
	clock_t elapsed = stop - start;
	double freq = 0.0;

	if (elapsed > 0)
		freq = ((double)checked / (double)elapsed) * CLOCKS_PER_SEC;

	printf("Calculation - Loop Start @%i; Loop End @%i {%i values checked on %i clocks} ~ %f calcs / sec\n", (int)start, (int)stop, checked, (int)elapsed, freq);
}

/*
 * Prints every BigInteger of `values` on its own line, in the result column
 * of the verbose trace.
 */
static void printBigResults(BigInteger* values, int count, char* buffer)
{
	int y;

	//separator
	printf("\n");

	for (y = 0; y < count; y++)
	{
		toString(&values[y], buffer);
		printf("\t\t\t\t\t%s\n", buffer);
	}
}

int main()
{
	t_proc();

	return 0;
}

/*
 * Benchmark driver. Builds k random operand pairs and a random operator for
 * each of them, then times the same workload three ways:
 *   4.1. plain int arithmetic on the host,
 *   4.2. BigInteger arithmetic on the host,
 *   4.3. BigInteger arithmetic on the device (kernel below).
 * Each phase repeats the k operations `perf` times and prints one line.
 */
void t_proc()
{
	//dimension & control
	int k = 8192; //number of items
	int perf = 100; //performance indicator
	int print = 0; //toggle to 1 to verbose print (performance drain!)
	int y;
	int z;
	int dx = 64; //CUDA config
	int dy = 64; //CUDA config

	//clock() samples are kept as clock_t: a float cannot hold large tick counts exactly
	clock_t str;
	clock_t end;

	//Int Data
	int* a = NULL;
	int* b = NULL;

	//Host Data
	BigInteger* hostBIa = NULL;
	BigInteger* hostBIb = NULL;
	int* hostInt = NULL;
	char* hostOp = NULL;

	//Device Data
	BigInteger* deviceBIa = NULL;
	BigInteger* deviceBIb = NULL;
	int* deviceInt = NULL;
	char* deviceOp = NULL;
	BigInteger* deviceX = NULL;
	BigInteger* deviceY = NULL;

	//printing & aux vars
	static const char ops[4] = { '+', '-', '*', '/' };
	memory* hostM = (memory*)checkedAlloc(getMemorySize());
	char* st1;
	char* st2;
	char* st3;

	//start the engine
	init((void**)hostM);
	iniStr(&st1);
	iniStr(&st2);
	iniStr(&st3);

	srand((unsigned int)time(NULL));

	if (print == 1)
		printf("y\tva\top\tvb\tret\tres\n");

	//1. Allocate host memory (k times)
	hostBIa = (BigInteger*)checkedAlloc(sizeof(BigInteger) * k);
	hostBIb = (BigInteger*)checkedAlloc(sizeof(BigInteger) * k);
	hostInt = (int*)checkedAlloc(sizeof(int) * k);
	hostOp = (char*)checkedAlloc(sizeof(char) * k);
	a = (int*)checkedAlloc(sizeof(int) * k);
	b = (int*)checkedAlloc(sizeof(int) * k);

	//2. Give values
	//2.1. va
	fillRandomValues(hostBIa, a, k);

	//2.2. vb
	fillRandomValues(hostBIb, b, k);

	//2.3. ret
	for (y = 0; y < k; y++)
		hostInt[y] = rand() % 10;

	//2.4. op
	for (y = 0; y < k; y++)
		hostOp[y] = ops[rand() % 4];

	//2.5. Move it to Device (deviceX and deviceY are seeded with vb and handed to the kernel as extra operands)
	h2d((void**)&deviceBIa, hostBIa, k, sizeof(BigInteger));
	h2d((void**)&deviceBIb, hostBIb, k, sizeof(BigInteger));
	h2d((void**)&deviceX, hostBIb, k, sizeof(BigInteger));
	h2d((void**)&deviceY, hostBIb, k, sizeof(BigInteger));
	h2d((void**)&deviceInt, hostInt, k, sizeof(int));
	h2d((void**)&deviceOp, hostOp, k, sizeof(char));

	//3. Trace information
	if (print == 1)
	{
		for (y = 0; y < k; y++)
		{
			toString(&hostBIa[y], st1);
			toString(&hostBIb[y], st2);
			printf("%i\t%s\t%c\t%s\t%i\n", y, st1, hostOp[y], st2, hostInt[y]);
		}
	}

	//4. Calculation
	//4.1. Int calculation
	str = clock();

	for (z = 0; z < perf; z++)
	{
		for (y = 0; y < k; y++)
		{
			switch (hostOp[y])
			{
			case '+':
				a[y] += b[y];
				break;
			case '-':
				a[y] -= b[y];
				break;
			case '*':
				//repeated products exceed INT_MAX; wrap through unsigned to avoid signed-overflow UB
				a[y] = (int)((unsigned int)a[y] * (unsigned int)b[y]);
				break;
			case '/':
				//vb is rand() % 10000 and may be 0: skip instead of trapping on an integer division by zero
				if (b[y] != 0)
					a[y] /= b[y];
				break;
			}
		}
	}

	end = clock();

	if (print == 1)
	{
		for (y = 0; y < k; y++)
			printf("\t\t\t\t\t%i\n", a[y]);
	}

	reportThroughput(str, end, k * perf);

	//4.2. Host Calculation
	str = clock();

	for (z = 0; z < perf; z++)
	{
		for (y = 0; y < k; y++)
		{
			//'s', '^' and '?' are never generated by step 2.4; they are kept for manual experiments
			switch (hostOp[y])
			{
			case '+':
				add(&hostBIa[y], &hostBIb[y], hostM);
				break;
			case '-':
				sub(&hostBIa[y], &hostBIb[y], hostM);
				break;
			case '*':
				mul(&hostBIa[y], &hostBIb[y], hostM);
				break;
			case '/':
				dvs(&hostBIa[y], &hostBIb[y], hostM);
				break;
			case 's':
				nqrt(&hostBIa[y], hostInt[y], hostM);
				break;
			case '^':
				bipow(&hostBIa[y], hostInt[y], hostM);
				break;
			case '?':
				equals(&hostBIa[y], &hostBIb[y], &hostInt[y]);
				break;
			}
		}
	}

	end = clock();

	if (print == 1)
		printBigResults(hostBIa, k, st1);

	reportThroughput(str, end, k * perf);

	//4.3. Device calculation
	str = clock();

	for (z = 0; z < perf; z++)
	{
		kernel <<<dx, dy>>>(deviceBIa, deviceBIb, deviceX, deviceY, deviceInt, deviceOp, k);

		//a launch failure is only visible through cudaGetLastError; an execution fault
		//surfaces at the synchronization. Without these checks a dead kernel would
		//still be timed and reported as a (very fast) valid run.
		checkCuda(cudaGetLastError(), "kernel launch");
		checkCuda(cudaDeviceSynchronize(), "kernel execution");
	}

	end = clock();

	//move data back
	d2h(hostBIa, &deviceBIa, k, sizeof(BigInteger));

	//print result
	if (print == 1)
		printBigResults(hostBIa, k, st1);

	reportThroughput(str, end, k * perf);

	//8. Finish
	//release the buffers allocated with malloc in this function.
	//NOTE(review): hostM, st1..st3 and the device buffers are owned through BigInteger.h
	//helpers (init / iniStr / h2d / d2h) whose release contract is not visible here - confirm
	//and free them with the matching library calls.
	free(a);
	free(b);
	free(hostOp);
	free(hostInt);
	free(hostBIb);
	free(hostBIa);

	return;
}

/*
 * Device benchmark kernel: applies operator c[ind] to the operand pair
 * (a[ind], b[ind]) for every ind in [0, n).
 *
 *   a, b: arrays of n BigInteger (passed as void*); a[ind] and b[ind] are the operands
 *   x, y: arrays of n BigInteger passed as extra arguments to the CUsub / CUsMul / CUsDvs calls
 *   i:    per-item int data; not read by this kernel
 *   c:    array of n operator characters ('+', '-', '*', '/'); any other value is skipped
 *   n:    number of items
 *
 * Launch layout: 1D grid of 1D blocks. Items are visited with a grid-stride
 * loop, so any <<<blocks, threads>>> configuration covers all n items.
 * No shared memory is used.
 */
__global__ void kernel(void* a, void* b, void* x, void* y, int* i, char* c, int n)
{
	BigInteger* va = (BigInteger*)a;
	BigInteger* vb = (BigInteger*)b;
	BigInteger* vx = (BigInteger*)x;
	BigInteger* vy = (BigInteger*)y;
	int ind;
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	int inc = blockDim.x * gridDim.x;

	for (ind = idx; ind < n; ind += inc)
	{
		switch (c[ind])
		{
		case '+':
			CUpAdd(&va[ind], &vb[ind]);
			break;
		case '-':
			CUsub(&va[ind], &vb[ind], &vx[ind]);
			break;
		case '*':
			CUsMul(&va[ind], &vb[ind], &vx[ind], &vy[ind]);
			break;
		case '/':
			CUsDvs(&va[ind], &vb[ind], &vx[ind], &vy[ind]);
			break;
		}
	}
}