diff --git a/CMakeLists.txt b/CMakeLists.txt
index d2bebc5..d2b2491 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,10 @@ cmake_minimum_required(VERSION 3.9)
 set(NAME "Vec3")
 project(${NAME})
 
+# Check for CUDA
+include(CheckLanguage)
+check_language(CUDA)
+
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 add_compile_options(-Wall -Wextra -Wpedantic)
diff --git a/README.md b/README.md
index 9cfb835..b68a789 100644
--- a/README.md
+++ b/README.md
@@ -21,4 +21,5 @@ endif()
 ## Features
 
 * Guards all testing code to only be run when Vec3 is the main project
+* Compatible with both CUDA and C++ 
 
diff --git a/include/vec3.h b/include/vec3.h
index 6e37d1e..54dcf4e 100644
--- a/include/vec3.h
+++ b/include/vec3.h
@@ -3,35 +3,44 @@
 
 #include <cmath>
 #include <limits>
+
+#ifdef __CUDACC__
+#define CUDA_CALLABLE __host__ __device__
+#else
+#define CUDA_CALLABLE
+#endif
+
 template <typename T> struct Vec3 {
   T x;
   T y;
   T z;
 
-  inline Vec3<T> operator+(Vec3<T> other) const {
+  CUDA_CALLABLE inline Vec3<T> operator+(Vec3<T> other) const {
     return {x + other.x, y + other.y, z + other.z};
   };
 
-  inline Vec3<T> operator-(Vec3<T> other) const {
+  CUDA_CALLABLE inline Vec3<T> operator-(Vec3<T> other) const {
     return {x - other.x, y - other.y, z - other.z};
   };
 
-  inline Vec3 scale(T scalar) { return {x * scalar, y * scalar, z * scalar}; };
+  CUDA_CALLABLE inline Vec3 scale(T scalar) {
+    return {x * scalar, y * scalar, z * scalar};
+  };
 
-  inline T dot(Vec3<T> other) const {
+  CUDA_CALLABLE inline T dot(Vec3<T> other) const {
     return x * other.x + y * other.y + z * other.z;
   }
 
-  inline Vec3<T> cross(Vec3<T> other) const {
+  CUDA_CALLABLE inline Vec3<T> cross(Vec3<T> other) const {
     return {y * other.z - z * other.y, z * other.x - x * other.z,
             x * other.y - y * other.x};
   }
 
-  inline T squared_norm2() const { return x * x + y * y + z * z; }
+  CUDA_CALLABLE inline T squared_norm2() const { return x * x + y * y + z * z; }
 
-  inline T norm2() const { return std::sqrt(squared_norm2()); }
+  CUDA_CALLABLE inline T norm2() const { return std::sqrt(squared_norm2()); }
 
-  inline Vec3<T> normalized() {
+  CUDA_CALLABLE inline Vec3<T> normalized() {
     // Add epsilon to the norm for stability when the norm is 0
     T norm = std::max(norm2(), std::numeric_limits<T>::epsilon());
     return {x / norm, y / norm, z / norm};
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 85a8157..ae6b103 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,4 +10,16 @@ if(NOT EXISTS ${GOOGLETEST_DIR})
 endif()
 
 add_subdirectory(lib/googletest)
-add_subdirectory(unit_tests)
\ No newline at end of file
+add_subdirectory(unit_tests)
+
+# Only run Cuda tests if cuda is available
+if (CMAKE_CUDA_COMPILER)
+    set(CMAKE_CUDA_ARCHITECTURES 61)
+    set(CUDA_SEPARABLE_COMPILATION ON)
+
+    add_subdirectory(cuda_unit_tests)
+    message(STATUS "CUDA found. CUDA tests will be build")
+else()
+    message(STATUS "CUDA not found. Skipping CUDA tests")
+endif()
+
diff --git a/tests/cuda_unit_tests/CMakeLists.txt b/tests/cuda_unit_tests/CMakeLists.txt
new file mode 100644
index 0000000..344a8e8
--- /dev/null
+++ b/tests/cuda_unit_tests/CMakeLists.txt
@@ -0,0 +1,10 @@
+include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
+
+add_executable(${NAME}_cuda_tests
+    vec3_test.cu
+)
+
+target_link_libraries(${NAME}_cuda_tests gtest gtest_main)
+target_link_libraries(${NAME}_cuda_tests ${NAME})
+
+add_test(NAME Vec3CudaTests COMMAND ${CMAKE_BINARY_DIR}/tests/cuda_unit_tests/${NAME}_cuda_tests)
diff --git a/tests/cuda_unit_tests/vec3_test.cu b/tests/cuda_unit_tests/vec3_test.cu
new file mode 100644
index 0000000..f4d7b0f
--- /dev/null
+++ b/tests/cuda_unit_tests/vec3_test.cu
@@ -0,0 +1,177 @@
+#include "vec3.h"
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+
+// Define kernel function to test Vec3 operations
+template <typename T>
+__global__ void testVec3Operations(Vec3<T> *results, Vec3<T> a, Vec3<T> b,
+                                   T scalar) {
+  int idx = threadIdx.x;
+
+  // Test different operations based on thread index
+  switch (idx) {
+  case 0: // Addition
+    results[idx] = a + b;
+    break;
+  case 1: // Subtraction
+    results[idx] = a - b;
+    break;
+  case 2: // Scale
+    results[idx] = a.scale(scalar);
+    break;
+  case 3: // Dot product - store in x component
+    results[idx].x = a.dot(b);
+    results[idx].y = 0;
+    results[idx].z = 0;
+    break;
+  case 4: // Cross product
+    results[idx] = a.cross(b);
+    break;
+  case 5: // Squared norm - store in x component
+    results[idx].x = a.squared_norm2();
+    results[idx].y = 0;
+    results[idx].z = 0;
+    break;
+  case 6: // Norm - store in x component
+    results[idx].x = a.norm2();
+    results[idx].y = 0;
+    results[idx].z = 0;
+    break;
+  case 7: // Normalized
+    results[idx] = a.normalized();
+    break;
+  }
+}
+
+// Test fixture for Vec3 CUDA tests
+class Vec3CudaTest : public ::testing::Test {
+protected:
+  void SetUp() override {
+    // Allocate device memory for results
+    cudaMalloc(&d_results, NUM_TESTS * sizeof(Vec3<float>));
+  }
+
+  void TearDown() override {
+    // Free device memory
+    cudaFree(d_results);
+  }
+
+  // Number of operations to test
+  static const int NUM_TESTS = 8;
+
+  // Pointer to device memory for results
+  Vec3<float> *d_results;
+
+  // Host memory for results
+  Vec3<float> h_results[NUM_TESTS];
+
+  // Test with a reasonable epsilon for floating point comparisons
+  float epsilon = 1e-5f;
+};
+
+TEST_F(Vec3CudaTest, BasicOperations) {
+  // Define test vectors
+  Vec3<float> a{1.0f, 2.0f, 3.0f};
+  Vec3<float> b{4.0f, 5.0f, 6.0f};
+  float scalar = 2.0f;
+
+  // Launch kernel with 8 threads to test different operations
+  testVec3Operations<<<1, NUM_TESTS>>>(d_results, a, b, scalar);
+
+  // Check for kernel execution errors
+  cudaError_t cudaStatus = cudaGetLastError();
+  ASSERT_EQ(cudaStatus, cudaSuccess)
+      << "Kernel launch failed: " << cudaGetErrorString(cudaStatus);
+
+  // Copy results back to host
+  cudaStatus = cudaMemcpy(h_results, d_results, NUM_TESTS * sizeof(Vec3<float>),
+                          cudaMemcpyDeviceToHost);
+  ASSERT_EQ(cudaStatus, cudaSuccess)
+      << "cudaMemcpy failed: " << cudaGetErrorString(cudaStatus);
+
+  // Wait for GPU to finish
+  cudaStatus = cudaDeviceSynchronize();
+  ASSERT_EQ(cudaStatus, cudaSuccess)
+      << "cudaDeviceSynchronize failed: " << cudaGetErrorString(cudaStatus);
+
+  // Test addition
+  EXPECT_NEAR(h_results[0].x, 5.0f, epsilon);
+  EXPECT_NEAR(h_results[0].y, 7.0f, epsilon);
+  EXPECT_NEAR(h_results[0].z, 9.0f, epsilon);
+
+  // Test subtraction
+  EXPECT_NEAR(h_results[1].x, -3.0f, epsilon);
+  EXPECT_NEAR(h_results[1].y, -3.0f, epsilon);
+  EXPECT_NEAR(h_results[1].z, -3.0f, epsilon);
+
+  // Test scale
+  EXPECT_NEAR(h_results[2].x, 2.0f, epsilon);
+  EXPECT_NEAR(h_results[2].y, 4.0f, epsilon);
+  EXPECT_NEAR(h_results[2].z, 6.0f, epsilon);
+
+  // Test dot product
+  EXPECT_NEAR(h_results[3].x, 32.0f, epsilon);
+
+  // Test cross product
+  EXPECT_NEAR(h_results[4].x, -3.0f, epsilon);
+  EXPECT_NEAR(h_results[4].y, 6.0f, epsilon);
+  EXPECT_NEAR(h_results[4].z, -3.0f, epsilon);
+
+  // Test squared norm
+  EXPECT_NEAR(h_results[5].x, 14.0f, epsilon);
+
+  // Test norm
+  EXPECT_NEAR(h_results[6].x, std::sqrt(14.0f), epsilon);
+
+  // Test normalized
+  float norm = std::sqrt(14.0f);
+  EXPECT_NEAR(h_results[7].x, 1.0f / norm, epsilon);
+  EXPECT_NEAR(h_results[7].y, 2.0f / norm, epsilon);
+  EXPECT_NEAR(h_results[7].z, 3.0f / norm, epsilon);
+}
+
+TEST_F(Vec3CudaTest, EdgeCases) {
+  // Test with zero vector
+  Vec3<float> zero{0.0f, 0.0f, 0.0f};
+  Vec3<float> nonZero{1.0f, 2.0f, 3.0f};
+  float scalar = 5.0f;
+
+  // Launch kernel with 8 threads to test different operations
+  testVec3Operations<<<1, NUM_TESTS>>>(d_results, zero, nonZero, scalar);
+
+  // Check for kernel execution errors
+  cudaError_t cudaStatus = cudaGetLastError();
+  ASSERT_EQ(cudaStatus, cudaSuccess)
+      << "Kernel launch failed: " << cudaGetErrorString(cudaStatus);
+
+  // Copy results back to host
+  cudaStatus = cudaMemcpy(h_results, d_results, NUM_TESTS * sizeof(Vec3<float>),
+                          cudaMemcpyDeviceToHost);
+  ASSERT_EQ(cudaStatus, cudaSuccess)
+      << "cudaMemcpy failed: " << cudaGetErrorString(cudaStatus);
+
+  // Wait for GPU to finish
+  cudaStatus = cudaDeviceSynchronize();
+  ASSERT_EQ(cudaStatus, cudaSuccess)
+      << "cudaDeviceSynchronize failed: " << cudaGetErrorString(cudaStatus);
+
+  // Test normalized with zero vector (should handle epsilon)
+  // Normalized of zero vector should be very small but not NaN
+  EXPECT_FALSE(isnan(h_results[7].x));
+  EXPECT_FALSE(isnan(h_results[7].y));
+  EXPECT_FALSE(isnan(h_results[7].z));
+
+  // Test dot product with zero vector (should be zero)
+  EXPECT_NEAR(h_results[3].x, 0.0f, epsilon);
+
+  // Test cross product with zero vector (should be zero)
+  EXPECT_NEAR(h_results[4].x, 0.0f, epsilon);
+  EXPECT_NEAR(h_results[4].y, 0.0f, epsilon);
+  EXPECT_NEAR(h_results[4].z, 0.0f, epsilon);
+}
+
+// Main function to run all tests
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt
index 305a7e6..b0bc0d8 100644
--- a/tests/unit_tests/CMakeLists.txt
+++ b/tests/unit_tests/CMakeLists.txt
@@ -5,6 +5,6 @@ add_executable(${NAME}_tests
 )
 
 target_link_libraries(${NAME}_tests gtest gtest_main)
-target_link_libraries(${NAME_tests} ${NAME})
+target_link_libraries(${NAME}_tests ${NAME})
 
 add_test(NAME Vec3Tests COMMAND ${CMAKE_BINARY_DIR}/tests/unit_tests/${NAME}_tests)