diff --git a/examples/cornell_box b/examples/cornell_box
index 186b7f3..92b5450 100644
Binary files a/examples/cornell_box and b/examples/cornell_box differ
diff --git a/examples/cornell_box_metal_sphere b/examples/cornell_box_metal_sphere
index 30920a9..b83fcc8 100644
Binary files a/examples/cornell_box_metal_sphere and b/examples/cornell_box_metal_sphere differ
diff --git a/examples/cornell_box_metal_sphere.cpp b/examples/cornell_box_metal_sphere.cpp
index 71dcf50..599cce9 100644
--- a/examples/cornell_box_metal_sphere.cpp
+++ b/examples/cornell_box_metal_sphere.cpp
@@ -299,7 +299,7 @@ void setup_cornell_box() {
 	g_scene->add_mesh(tall_box);
 
 	// Metal sphere (replacing the glass box, positioned on the right side)
-	auto metal_sphere = create_sphere(0.5f, 16, 8, /*metal_id*/white_id);
+	auto metal_sphere = create_sphere(0.5f, 64, 32, /*metal_id*/white_id);
 	metal_sphere->set_position(Vec3(0.55f, -1.5f, 0.35f));
 	metal_sphere->upload_to_gpu();
 	g_scene->add_mesh(metal_sphere);
diff --git a/include/core/bvh.h b/include/core/bvh.h
index 22b231e..ea99343 100644
--- a/include/core/bvh.h
+++ b/include/core/bvh.h
@@ -68,18 +68,23 @@ struct BVHNodeGpu {
 	Vec4 aabb_max_count_; ///< xyz = aabb max, w = count (uint, 0 for internal)
 };
 
-// GPU-friendly triangle layout (std430 aligned)
-struct TriangleGpu {
-	Vec4 v0_material_; ///< xyz = v0, w = material_id (uint)
-	Vec4 v1_; ///< xyz = v1, w = reserved
-	Vec4 v2_; ///< xyz = v2, w = reserved
-	Vec4 n0_; ///< xyz = n0, w = reserved
-	Vec4 n1_; ///< xyz = n1, w = reserved
-	Vec4 n2_; ///< xyz = n2, w = reserved
+// Compact triangle for intersection testing only (48 bytes = 3 x vec4); mirrors TriangleCompactGpu in shaders/include/structs.glsl
+// Stores the two edge vectors instead of v1/v2 so the shader's Moller-Trumbore test does not recompute them
+struct TriangleCompactGpu {
+	Vec4 v0_material_; ///< xyz = v0 position, w = material_id (uint bit pattern, written with uintBitsToFloat)
+	Vec4 e1_; ///< xyz = v1 - v0 (precomputed edge 1), w = unused (written as 0)
+	Vec4 e2_; ///< xyz = v2 - v0 (precomputed edge 2), w = unused (written as 0)
+};
+
+// Per-triangle shading attributes (112 bytes = 7 x vec4), stored at the same index as the matching compact triangle and read only for the final hit
+struct TriangleAttrGpu {
+	Vec4 n0_; ///< xyz = normal at v0, w = unused (written as 0)
+	Vec4 n1_; ///< xyz = normal at v1, w = unused (written as 0)
+	Vec4 n2_; ///< xyz = normal at v2, w = unused (written as 0)
 	Vec4 uv0_uv1_; ///< xy = uv0, zw = uv1
-	Vec4 uv2_; ///< xy = uv2, zw = reserved
-	Vec4 t0_; ///< xyz = t0 (tangent at v0), w = reserved
-	Vec4 t1_; ///< xyz = t1 (tangent at v1), w = reserved
+	Vec4 uv2_; ///< xy = uv2, zw = unused (written as 0)
+	Vec4 t0_; ///< xyz = tangent at v0, w = unused (written as 0)
+	Vec4 t1_; ///< xyz = tangent at v1, w = unused (written as 0); no v2 tangent is stored, the shader uses normalize(cross(n0, t0)) in its place
 };
 
 /*
@@ -116,10 +121,11 @@ public:
 	/*
 	 * @brief Upload BVH to GPU
 	 * @param node_buffer Buffer for BVH nodes
-	 * @param triangle_buffer Buffer for triangles
+	 * @param triangle_buffer Buffer for compact triangles (v0 + two edges + material id; intersection only)
+	 * @param attr_buffer Buffer for triangle attributes (fetched on hit); entry i describes the same triangle as triangle_buffer entry i
 	 * @return True if upload succeeded
 	 */
-	bool upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer);
+	bool upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer, Buffer &attr_buffer);
 
 	/*
 	 * @brief Get total node count
diff --git a/include/core/raytracer.h b/include/core/raytracer.h
index 293a3db..5049a21 100644
--- a/include/core/raytracer.h
+++ b/include/core/raytracer.h
@@ -110,7 +110,8 @@ private:
 	// BVH related
 	std::unique_ptr<BVH> bvh_;
 	Buffer bvh_node_buffer_;
-	Buffer bvh_triangle_buffer_;
+	Buffer bvh_triangle_buffer_; ///< Compact triangle data (intersection only)
+	Buffer bvh_attr_buffer_; ///< Triangle attributes (fetched on hit)
 	bool bvh_built_;
 
 	uint frame_count_;
diff --git a/shaders/include/bvh.glsl b/shaders/include/bvh.glsl
index 3af0922..0111230 100644
--- a/shaders/include/bvh.glsl
+++ b/shaders/include/bvh.glsl
@@ -38,14 +38,13 @@ bool intersect_aabb(Ray ray, vec3 aabb_min, vec3 aabb_max, float t_max) {
     return intersect_aabb_t(ray, aabb_min, aabb_max, t_max) >= 0.0;
 }
 
-// Moller-Trumbore triangle intersection
-bool intersect_triangle(Ray ray, TriangleGpu tri, inout HitInfo hit) {
+// Moller-Trumbore triangle intersection using compact triangle (precomputed edges)
+// Uses TriangleCompactGpu: v0_material, e1=v1-v0, e2=v2-v0
+bool intersect_triangle_compact(Ray ray, TriangleCompactGpu tri, inout HitInfo hit) {
     vec3 v0 = tri.v0_material.xyz;
-    vec3 v1 = tri.v1.xyz;
-    vec3 v2 = tri.v2.xyz;
+    vec3 e1 = tri.e1.xyz;
+    vec3 e2 = tri.e2.xyz;
 
-    vec3 e1 = v1 - v0;
-    vec3 e2 = v2 - v0;
     vec3 pvec = cross(ray.direction, e2);
     float det = dot(e1, pvec);
 
@@ -64,26 +63,37 @@ bool intersect_triangle(Ray ray, TriangleGpu tri, inout HitInfo hit) {
     if (t < EPSILON || t >= hit.t) return false;
 
     float w = 1.0 - u - v;
-    vec3 n0 = tri.n0.xyz;
-    vec3 n1 = tri.n1.xyz;
-    vec3 n2 = tri.n2.xyz;
 
-    vec2 uv0 = tri.uv0_uv1.xy;
-    vec2 uv1 = tri.uv0_uv1.zw;
-    vec2 uv2 = tri.uv2.xy;
-
-    vec3 t0 = tri.t0.xyz;
-    vec3 t1 = tri.t1.xyz;
-    vec3 t2 = normalize(cross(n0, t0));
+    // Attributes are deliberately not fetched here: only the compact triangle is
+    // available, not its index, so bvh_attrs[] cannot be addressed correctly (indexing
+    // by gl_GlobalInvocationID.x would read an unrelated or out-of-range element).
 
     hit.hit = true;
     hit.t = t;
     hit.position = ray.origin + t * ray.direction;
+    hit.material_id = as_uint(tri.v0_material.w);
+    return true;
+}
+
+// Interpolate normal/texcoord/tangent for triangle tri_idx from bvh_attrs; (w, u, v) weight vertices (v0, v1, v2). NOTE(review): trace_ray_bvh inlines this same logic - consider calling this instead.
+void finalize_hit(uint tri_idx, float u, float v, float w, inout HitInfo hit) {
+    TriangleAttrGpu attr = bvh_attrs[tri_idx];
+
+    vec3 n0 = attr.n0.xyz;
+    vec3 n1 = attr.n1.xyz;
+    vec3 n2 = attr.n2.xyz;
+
+    vec2 uv0 = attr.uv0_uv1.xy;
+    vec2 uv1 = attr.uv0_uv1.zw;
+    vec2 uv2 = attr.uv2.xy;
+
+    vec3 t0 = attr.t0.xyz;
+    vec3 t1 = attr.t1.xyz;
+    vec3 t2 = normalize(cross(n0, t0));
+
     hit.normal = normalize(n0 * w + n1 * u + n2 * v);
     hit.texcoord = uv0 * w + uv1 * u + uv2 * v;
     hit.tangent = normalize(t0 * w + t1 * u + t2 * v);
-    hit.material_id = as_uint(tri.v0_material.w);
-    return true;
 }
 
 // BVH traversal (closest hit) with distance-sorted children
@@ -92,6 +102,11 @@ HitInfo trace_ray_bvh(Ray ray) {
     hit.hit = false;
     hit.t = MAX_FLOAT;
 
+    // Track barycentric coords and triangle index for hit finalization
+    uint hit_tri_idx = 0u;
+    float hit_u = 0.0;
+    float hit_v = 0.0;
+
     if (!u_use_bvh || u_bvh_node_count == 0u) {
         return hit;
     }
@@ -114,12 +129,39 @@ HitInfo trace_ray_bvh(Ray ray) {
 
         if (count > 0u) {
             for (uint i = 0u; i < count; ++i) {
-                TriangleGpu tri = bvh_tris[left_first + i];
-                intersect_triangle(ray, tri, hit);
+                uint tri_idx = left_first + i;
+                TriangleCompactGpu tri = bvh_tris[tri_idx];
+                vec3 v0 = tri.v0_material.xyz;
+                vec3 e1 = tri.e1.xyz;
+                vec3 e2 = tri.e2.xyz;
+
+                vec3 pvec = cross(ray.direction, e2);
+                float det = dot(e1, pvec);
+
+                if (abs(det) < EPSILON) continue;
+                float inv_det = 1.0 / det;
+
+                vec3 tvec = ray.origin - v0;
+                float u = dot(tvec, pvec) * inv_det;
+                if (u < 0.0 || u > 1.0) continue;
+
+                vec3 qvec = cross(tvec, e1);
+                float v = dot(ray.direction, qvec) * inv_det;
+                if (v < 0.0 || u + v > 1.0) continue;
+
+                float t = dot(e2, qvec) * inv_det;
+                if (t < EPSILON || t >= hit.t) continue;
+
+                // Record hit but defer attribute fetch
+                hit.hit = true;
+                hit.t = t;
+                hit.position = ray.origin + t * ray.direction;
+                hit.material_id = as_uint(tri.v0_material.w);
+                hit_tri_idx = tri_idx;
+                hit_u = u;
+                hit_v = v;
             }
         } else {
-            // Distance-sorted child traversal: push farther child first
-            // so closer child is processed first, improving early termination
             uint left = left_first;
             uint right = left_first + 1u;
 
@@ -134,7 +176,6 @@ HitInfo trace_ray_bvh(Ray ray) {
             bool right_valid = t_right >= 0.0;
 
             if (left_valid && right_valid) {
-                // Both valid: push farther first
                 if (t_left < t_right) {
                     if (sp < 63) stack[sp++] = right;
                     if (sp < 63) stack[sp++] = left;
@@ -150,10 +191,32 @@ HitInfo trace_ray_bvh(Ray ray) {
         }
     }
 
+    // Fetch attributes only once for the final closest hit
+    if (hit.hit) {
+        float w = 1.0 - hit_u - hit_v;
+        TriangleAttrGpu attr = bvh_attrs[hit_tri_idx];
+
+        vec3 n0 = attr.n0.xyz;
+        vec3 n1 = attr.n1.xyz;
+        vec3 n2 = attr.n2.xyz;
+
+        vec2 uv0 = attr.uv0_uv1.xy;
+        vec2 uv1 = attr.uv0_uv1.zw;
+        vec2 uv2 = attr.uv2.xy;
+
+        vec3 t0 = attr.t0.xyz;
+        vec3 t1 = attr.t1.xyz;
+        vec3 t2 = normalize(cross(n0, t0));
+
+        hit.normal = normalize(n0 * w + n1 * hit_u + n2 * hit_v);
+        hit.texcoord = uv0 * w + uv1 * hit_u + uv2 * hit_v;
+        hit.tangent = normalize(t0 * w + t1 * hit_u + t2 * hit_v);
+    }
+
     return hit;
 }
 
-// Any-hit BVH for shadow ray (no sorting needed - early exit on first hit)
+// Any-hit BVH for shadow ray (no attribute fetch needed - early exit on first hit)
 bool trace_any_bvh(Ray ray, float t_max) {
     if (!u_use_bvh || u_bvh_node_count == 0u) return false;
 
@@ -161,10 +224,6 @@ bool trace_any_bvh(Ray ray, float t_max) {
     int sp = 0;
     stack[sp++] = 0u;
 
-    HitInfo hit;
-    hit.hit = false;
-    hit.t = t_max;
-
     while (sp > 0) {
         uint node_idx = stack[--sp];
         if (node_idx >= u_bvh_node_count) continue;
@@ -179,8 +238,29 @@ bool trace_any_bvh(Ray ray, float t_max) {
 
         if (count > 0u) {
             for (uint i = 0u; i < count; ++i) {
-                TriangleGpu tri = bvh_tris[left_first + i];
-                if (intersect_triangle(ray, tri, hit)) return true;
+                TriangleCompactGpu tri = bvh_tris[left_first + i];
+                vec3 v0 = tri.v0_material.xyz;
+                vec3 e1 = tri.e1.xyz;
+                vec3 e2 = tri.e2.xyz;
+
+                vec3 pvec = cross(ray.direction, e2);
+                float det = dot(e1, pvec);
+
+                if (abs(det) < EPSILON) continue;
+                float inv_det = 1.0 / det;
+
+                vec3 tvec = ray.origin - v0;
+                float u = dot(tvec, pvec) * inv_det;
+                if (u < 0.0 || u > 1.0) continue;
+
+                vec3 qvec = cross(tvec, e1);
+                float v = dot(ray.direction, qvec) * inv_det;
+                if (v < 0.0 || u + v > 1.0) continue;
+
+                float t = dot(e2, qvec) * inv_det;
+                if (t < EPSILON || t >= t_max) continue;
+
+                return true;
             }
         } else {
             uint left = left_first;
diff --git a/shaders/include/structs.glsl b/shaders/include/structs.glsl
index 11ce6f7..3bb92a8 100644
--- a/shaders/include/structs.glsl
+++ b/shaders/include/structs.glsl
@@ -53,6 +53,26 @@ struct BVHNodeGpu {
     vec4 aabb_max_count;
 };
 
+// Compact triangle for intersection testing (48 bytes = 3 x vec4); mirrors TriangleCompactGpu in include/core/bvh.h
+// Edge vectors e1 = v1-v0, e2 = v2-v0 are precomputed on the CPU for Moller-Trumbore
+struct TriangleCompactGpu {
+    vec4 v0_material; ///< xyz = v0 position, w = material_id bit pattern (decode with as_uint)
+    vec4 e1;          ///< xyz = v1 - v0 (precomputed), w = unused
+    vec4 e2;          ///< xyz = v2 - v0 (precomputed), w = unused
+};
+
+// Per-triangle shading attributes (112 bytes = 7 x vec4); bvh_attrs[i] belongs to bvh_tris[i] and is read only for the final hit
+struct TriangleAttrGpu {
+    vec4 n0;          ///< xyz = normal at v0, w = unused
+    vec4 n1;          ///< xyz = normal at v1, w = unused
+    vec4 n2;          ///< xyz = normal at v2, w = unused
+    vec4 uv0_uv1;     ///< xy = uv0, zw = uv1
+    vec4 uv2;         ///< xy = uv2, zw = unused
+    vec4 t0;          ///< xyz = tangent at v0, w = unused
+    vec4 t1;          ///< xyz = tangent at v1, w = unused (no v2 tangent is stored)
+};
+
+// Legacy full triangle layout (deprecated, kept for reference)
 struct TriangleGpu {
     vec4 v0_material;
     vec4 v1;
diff --git a/shaders/raytracing/raytracing.comp b/shaders/raytracing/raytracing.comp
index dd836b3..a981c9e 100644
--- a/shaders/raytracing/raytracing.comp
+++ b/shaders/raytracing/raytracing.comp
@@ -27,7 +27,8 @@ layout(binding = 4, rgba32f) uniform image2D accumulation_image;
 layout(std430, binding = 0) readonly buffer MaterialBuffer { Material materials[]; };
 layout(std430, binding = 1) readonly buffer LightBuffer { Light lights[]; };
 layout(std430, binding = 2) readonly buffer BVHNodeBuffer { BVHNodeGpu bvh_nodes[]; };
-layout(std430, binding = 3) readonly buffer TriangleBuffer { TriangleGpu bvh_tris[]; };
+layout(std430, binding = 3) readonly buffer TriangleBuffer { TriangleCompactGpu bvh_tris[]; };
+layout(std430, binding = 4) readonly buffer AttrBuffer { TriangleAttrGpu bvh_attrs[]; };
 
 // Uniforms
 uniform uint u_frame_count;
diff --git a/src/core/bvh.cpp b/src/core/bvh.cpp
index 5bedb84..184ffb6 100644
--- a/src/core/bvh.cpp
+++ b/src/core/bvh.cpp
@@ -85,8 +85,7 @@ bool BVH::build(const std::vector<std::shared_ptr<Mesh>> &meshes) {
 	// Build recursively
 	build_recursive_(0, 0, n);
 
-	ARE_LOG_INFO("BVH built: " + std::to_string(nodes_.size()) + " nodes, " +
-		std::to_string(triangles_.size()) + " triangles");
+	ARE_LOG_INFO("BVH built: " + std::to_string(nodes_.size()) + " nodes, " + std::to_string(triangles_.size()) + " triangles");
 
 	return true;
 }
@@ -308,8 +307,7 @@ float BVH::find_best_split_(uint first_prim, uint prim_count, int &axis, float &
 			// SAH cost: C_split = C_trav + (N_left * SA_left + N_right * SA_right) / SA_parent
 			float cost = 1.0f;
 			if (parent_sa > 0.0f) {
-				cost += (left_count * left_bounds.surface_area() +
-					right_count * right_bounds.surface_area()) / parent_sa;
+				cost += (left_count * left_bounds.surface_area() + right_count * right_bounds.surface_area()) / parent_sa;
 			}
 
 			if (cost < best_cost) {
@@ -347,7 +345,7 @@ AABB BVH::calculate_centroid_bounds_(uint first_prim, uint prim_count) {
 	return bounds;
 }
 
-bool BVH::upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer) {
+bool BVH::upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer, Buffer &attr_buffer) {
 	if (nodes_.empty() || triangles_.empty()) {
 		ARE_LOG_ERROR("Cannot upload empty BVH to GPU");
 		return false;
@@ -371,28 +369,36 @@ bool BVH::upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer) {
 		node_gpu[i] = g;
 	}
 
-	// Pack triangles to GPU layout
-	std::vector<TriangleGpu> tri_gpu;
-	tri_gpu.resize(ordered_triangles.size());
+	// Pack compact triangles (intersection only, 48 bytes each)
+	std::vector<TriangleCompactGpu> tri_compact;
+	tri_compact.resize(ordered_triangles.size());
 	for (size_t i = 0; i < ordered_triangles.size(); ++i) {
 		const Triangle &t = ordered_triangles[i];
 
-		TriangleGpu g {};
+		TriangleCompactGpu g {};
 		g.v0_material_ = Vec4(t.v0_, glm::uintBitsToFloat(t.material_id_));
-		g.v1_ = Vec4(t.v1_, 0.0f);
-		g.v2_ = Vec4(t.v2_, 0.0f);
+		g.e1_ = Vec4(t.v1_ - t.v0_, 0.0f);
+		g.e2_ = Vec4(t.v2_ - t.v0_, 0.0f);
 
+		tri_compact[i] = g;
+	}
+
+	// Pack triangle attributes (fetched only on hit, 112 bytes each)
+	std::vector<TriangleAttrGpu> tri_attr;
+	tri_attr.resize(ordered_triangles.size());
+	for (size_t i = 0; i < ordered_triangles.size(); ++i) {
+		const Triangle &t = ordered_triangles[i];
+
+		TriangleAttrGpu g {};
 		g.n0_ = Vec4(t.n0_, 0.0f);
 		g.n1_ = Vec4(t.n1_, 0.0f);
 		g.n2_ = Vec4(t.n2_, 0.0f);
-
 		g.uv0_uv1_ = Vec4(t.uv0_.x, t.uv0_.y, t.uv1_.x, t.uv1_.y);
 		g.uv2_ = Vec4(t.uv2_.x, t.uv2_.y, 0.0f, 0.0f);
-
 		g.t0_ = Vec4(t.t0_, 0.0f);
 		g.t1_ = Vec4(t.t1_, 0.0f);
 
-		tri_gpu[i] = g;
+		tri_attr[i] = g;
 	}
 
 	if (!node_buffer.create(BufferType::SHADER_STORAGE_BUFFER,
@@ -404,14 +410,22 @@ bool BVH::upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer) {
 	}
 
 	if (!triangle_buffer.create(BufferType::SHADER_STORAGE_BUFFER,
-			tri_gpu.size() * sizeof(TriangleGpu),
-			tri_gpu.data(),
+			tri_compact.size() * sizeof(TriangleCompactGpu),
+			tri_compact.data(),
 			BufferUsage::STATIC_DRAW)) {
-		ARE_LOG_ERROR("Failed to upload BVH triangles to GPU");
+		ARE_LOG_ERROR("Failed to upload BVH compact triangles to GPU");
 		return false;
 	}
 
-	ARE_LOG_INFO("BVH uploaded to GPU successfully");
+	if (!attr_buffer.create(BufferType::SHADER_STORAGE_BUFFER,
+			tri_attr.size() * sizeof(TriangleAttrGpu),
+			tri_attr.data(),
+			BufferUsage::STATIC_DRAW)) {
+		ARE_LOG_ERROR("Failed to upload BVH triangle attributes to GPU");
+		return false;
+	}
+
+	ARE_LOG_INFO("BVH uploaded to GPU: " + std::to_string(nodes_.size()) + " nodes, " + std::to_string(ordered_triangles.size()) + " triangles (" + std::to_string(tri_compact.size() * sizeof(TriangleCompactGpu) / 1024) + "KB compact + " + std::to_string(tri_attr.size() * sizeof(TriangleAttrGpu) / 1024) + "KB attr)");
 	return true;
 }
 
diff --git a/src/core/raytracer.cpp b/src/core/raytracer.cpp
index b38f4dd..0204285 100644
--- a/src/core/raytracer.cpp
+++ b/src/core/raytracer.cpp
@@ -124,6 +124,7 @@ void RayTracer::release() {
 
 	bvh_node_buffer_.release();
 	bvh_triangle_buffer_.release();
+	bvh_attr_buffer_.release();
 
 	bvh_.reset();
 	bvh_built_ = false;
@@ -149,7 +150,7 @@ bool RayTracer::rebuild_bvh(const Scene &scene) {
 		return false;
 	}
 
-	if (!bvh_->upload_to_gpu(bvh_node_buffer_, bvh_triangle_buffer_)) {
+	if (!bvh_->upload_to_gpu(bvh_node_buffer_, bvh_triangle_buffer_, bvh_attr_buffer_)) {
 		ARE_LOG_ERROR("Failed to upload BVH to GPU");
 		return false;
 	}
@@ -224,6 +225,7 @@ void RayTracer::trace(const Scene &scene, const GBuffer &gbuffer, TextureHandle
 	if (config_.use_bvh_ && bvh_built_) {
 		bvh_node_buffer_.bind_base(2);
 		bvh_triangle_buffer_.bind_base(3);
+		bvh_attr_buffer_.bind_base(4);
 		compute_shader_->set_bool("u_use_bvh", true);
 		compute_shader_->set_uint("u_bvh_node_count", bvh_->get_node_count());
 	} else {
@@ -412,7 +414,7 @@ void RayTracer::upload_scene_data_(const Scene &scene) {
 
 void RayTracer::bind_gbuffer_(const GBuffer &gbuffer) {
 	glBindImageTexture(0, gbuffer.get_texture(GBUFFER_POSITION), 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGBA32F);
-	glBindImageTexture(1, gbuffer.get_texture(GBUFFER_NORMAL), 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG32F);  // Octahedral encoded
+	glBindImageTexture(1, gbuffer.get_texture(GBUFFER_NORMAL), 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG32F); // Octahedral encoded
 
 	glBindImageTexture(5, gbuffer.get_texture(GBUFFER_MATERIAL), 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGBA32F);
 	glBindImageTexture(6, gbuffer.get_texture(GBUFFER_MATERIAL_ID), 0, GL_FALSE, 0, GL_READ_ONLY, GL_R32UI);