Stage one of the project, done

2025-09-29 22:40:17 -04:00
parent a3f14b18dc
commit c719c5873f
11 changed files with 529 additions and 0 deletions
--- a/scripts/test_gpu.py
+++ b/scripts/test_gpu.py
@@ -0,0 +1,81 @@
+"""
+GPU/CUDA Verification Script for Lyra
+Tests PyTorch CUDA functionality and reports GPU capabilities
+"""
+
+import torch
+
+def test_cuda():
+    """Print a CUDA/GPU report and smoke-test FP32/FP16/BF16 matmuls.
+
+    Returns:
+        bool: True if CUDA is available and every tensor operation
+        completed, False otherwise.
+    """
+    print("=" * 60)
+    print("CUDA/GPU Verification for Lyra")
+    print("=" * 60)
+
+    # Basic CUDA info
+    print(f"\n1. PyTorch Version: {torch.__version__}")
+    print(f"2. CUDA Available: {torch.cuda.is_available()}")
+
+    if not torch.cuda.is_available():
+        print("\n[ERROR] CUDA is not available!")
+        return False
+
+    print(f"3. CUDA Version: {torch.version.cuda}")
+    print(f"4. cuDNN Version: {torch.backends.cudnn.version()}")
+    print(f"5. Number of GPUs: {torch.cuda.device_count()}")
+
+    # GPU Details
+    for i in range(torch.cuda.device_count()):
+        print(f"\n--- GPU {i} ---")
+        print(f"Name: {torch.cuda.get_device_name(i)}")
+        props = torch.cuda.get_device_properties(i)
+        print(f"Compute Capability: {props.major}.{props.minor}")
+        print(f"Total Memory: {props.total_memory / 1024**3:.2f} GB")
+        print(f"Multi-Processors: {props.multi_processor_count}")
+
+    # Memory status of GPU 0. Free memory comes from the driver: deriving it
+    # as total - allocated ignores other processes and PyTorch's own cache.
+    free_bytes, _ = torch.cuda.mem_get_info(0)
+    print("\n--- Memory Status ---")
+    print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
+    print(f"Cached: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")
+    print(f"Free: {free_bytes / 1024**3:.2f} GB")
+
+    # Tensor operations test
+    print("\n--- Testing Tensor Operations ---")
+    try:
+        x = torch.randn(1000, 1000, device='cuda')
+        y = torch.randn(1000, 1000, device='cuda')
+        z = torch.matmul(x, y)
+        print(f"[OK] Matrix multiplication: {z.shape}")
+        print(f"[OK] Tensor device: {z.device}")
+        print(f"[OK] Tensor dtype: {z.dtype}")
+
+        # Reduced-precision products are not bound to names, so their memory
+        # is reclaimable as soon as each check finishes.
+        fp16_dtype = torch.matmul(x.half(), y.half()).dtype
+        print(f"[OK] FP16 operations: {fp16_dtype}")
+        if torch.cuda.is_bf16_supported():
+            bf16_dtype = torch.matmul(x.bfloat16(), y.bfloat16()).dtype
+            print(f"[OK] BF16 operations: {bf16_dtype}")
+        else:
+            print("[WARNING] BF16 not supported")
+
+        # CUDA kernels run asynchronously; wait for them so a failing kernel
+        # raises here rather than after success has been reported.
+        torch.cuda.synchronize()
+        del x, y, z
+        torch.cuda.empty_cache()
+        print("\n[SUCCESS] All GPU tests passed!")
+        return True
+
+    except Exception as e:
+        print(f"\n[ERROR] GPU test failed: {e}")
+        return False
+
+if __name__ == "__main__":
+    # SystemExit works even when the site-provided exit() helper is absent.
+    raise SystemExit(0 if test_cuda() else 1)