/* nvcc MM-GPU-GM.cu -o MM-GPU-GM.out ./MM-GPU-GM.out */ #include "cuda.h" #include #include #include void MatrixMultiplication(float *, float *, float *, int); void verify(float *P, int size){ int i,j; float sum; for(i=0;i>>( Md, Nd, Pd, Width); cudaThreadSynchronize(); // transfer P from device cudaMemcpy(P,Pd,size,cudaMemcpyDeviceToHost); cudaThreadSynchronize(); gettimeofday(&timer_end, NULL); double timer_spent = timer_end.tv_sec - timer_start.tv_sec + (timer_end.tv_usec - timer_start.tv_usec) / 1000000.0; printf("Time spent: %.6f\n", timer_spent); // free the memory allocated on the GPU cudaFree(Md); cudaFree(Nd); cudaFree(Pd); }