#include <stdio.h>
#include <cuda_runtime.h>

// Demo kernel: every thread prints its index within its block.
// Only threadIdx.x is used, so the printed value is per-block, not a global id.
__global__ void hello_cuda() {
    printf("Hello CUDA from thread %d\n", threadIdx.x);
}

// Reports a failed CUDA runtime call on stdout (the stream the rest of this
// file prints to) and returns whether the call succeeded.
//   what   - short description of the operation that was attempted
//   status - the cudaError_t returned by that operation
static bool cuda_call_ok(const char* what, cudaError_t status) {
    if (status == cudaSuccess) {
        return true;
    }
    printf("CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    fflush(stdout);
    return false;
}

// Prints the name and compute capability of the currently selected device,
// then launches hello_cuda with 1 block of 10 threads and waits for it to
// finish. Any CUDA failure is reported and the function returns early.
extern "C" void launch_hello_cuda() {
    // Query the device the kernel will actually run on rather than a fixed
    // index: a hard-coded index fails on machines with fewer GPUs and may
    // describe a different device than the one executing the kernel.
    int device = 0;
    if (!cuda_call_ok("cudaGetDevice", cudaGetDevice(&device))) {
        return;
    }

    cudaDeviceProp prop;
    if (!cuda_call_ok("cudaGetDeviceProperties",
                      cudaGetDeviceProperties(&prop, device))) {
        return;
    }
    printf("Using device: %s with compute capability %d.%d\n",
           prop.name, prop.major, prop.minor);

    hello_cuda<<<1, 10>>>();

    // A kernel launch returns no status itself; configuration errors are
    // picked up here, execution errors at the synchronize below.
    if (!cuda_call_ok("hello_cuda launch", cudaGetLastError())) {
        return;
    }
    if (!cuda_call_ok("cudaDeviceSynchronize", cudaDeviceSynchronize())) {
        return;
    }

    fflush(stdout);
}

// Prints the number of CUDA devices and, for each one, its name, compute
// capability, global memory size, multiprocessor count and launch limits.
// If the device count cannot be obtained, the error is printed and nothing
// else is reported.
extern "C" void check_cuda() {
    int deviceCount = 0;
    cudaError_t error = cudaGetDeviceCount(&deviceCount);
    if (error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        fflush(stdout);
        return;  // a "Found 0 CUDA devices" line would be misleading here
    }

    printf("Found %d CUDA devices\n", deviceCount);

    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        if (!cuda_call_ok("cudaGetDeviceProperties",
                          cudaGetDeviceProperties(&prop, i))) {
            continue;  // skip this device; prop contents are not valid
        }

        printf("Device %d: %s\n", i, prop.name);
        printf(" Compute capability: %d.%d\n", prop.major, prop.minor);
        // totalGlobalMem is a byte count; convert to double before dividing
        // so the result keeps its fractional part for the %.2f format.
        printf(" Total global memory: %.2f GB\n",
               static_cast<double>(prop.totalGlobalMem) / (1024 * 1024 * 1024));
        printf(" Multiprocessors: %d\n", prop.multiProcessorCount);
        printf(" Max threads per block: %d\n", prop.maxThreadsPerBlock);
        printf(" Max threads dimensions: (%d, %d, %d)\n",
               prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf(" Max grid dimensions: (%d, %d, %d)\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("\n");
    }

    fflush(stdout);
}