feat: 拆分三角形数据结构，添加三角形边预计算功能

- 将原 TriangleGpu 拆分为 TriangleCompactGpu 与 TriangleAttrGpu 两个结构：Compact 仅用于相交检测；Attr（法线/UV/切线）同样一次性上传，但 shader 只在确认最近击中后才读取，从而减少遍历过程中的显存读取
- 在 upload_to_gpu() 中预计算三角形的 e1、e2 边
- 同步 shader 端逻辑
2026-04-06 22:58:13 +08:00 · 2026-04-06 22:58:13 +08:00 · 08910e48d7
parent 09667267fe
commit 08910e48d7
10 changed files with 190 additions and 66 deletions
--- a/examples/cornell_box
+++ b/examples/cornell_box
--- a/examples/cornell_box_metal_sphere
+++ b/examples/cornell_box_metal_sphere
--- a/examples/cornell_box_metal_sphere.cpp
+++ b/examples/cornell_box_metal_sphere.cpp
@ -299,7 +299,7 @@ void setup_cornell_box() {
 	g_scene->add_mesh(tall_box);

 	// Metal sphere (replacing the glass box, positioned on the right side)
-	auto metal_sphere = create_sphere(0.5f, 16, 8, /*metal_id*/white_id);
+	auto metal_sphere = create_sphere(0.5f, 64, 32, /*metal_id*/white_id);
 	metal_sphere->set_position(Vec3(0.55f, -1.5f, 0.35f));
 	metal_sphere->upload_to_gpu();
 	g_scene->add_mesh(metal_sphere);
--- a/include/core/bvh.h
+++ b/include/core/bvh.h
@ -68,18 +68,23 @@ struct BVHNodeGpu {
 	Vec4 aabb_max_count_; ///< xyz = aabb max, w = count (uint, 0 for internal)
 };

-// GPU-friendly triangle layout (std430 aligned)
-struct TriangleGpu {
-	Vec4 v0_material_; ///< xyz = v0, w = material_id (uint)
-	Vec4 v1_; ///< xyz = v1, w = reserved
-	Vec4 v2_; ///< xyz = v2, w = reserved
-	Vec4 n0_; ///< xyz = n0, w = reserved
-	Vec4 n1_; ///< xyz = n1, w = reserved
-	Vec4 n2_; ///< xyz = n2, w = reserved
+// Compact per-triangle record read during ray/triangle intersection (48 bytes = 3 x vec4).
+// Stores v0 plus both edge vectors so the shader's Moller-Trumbore test does not recompute them.
+struct TriangleCompactGpu {
+	Vec4 v0_material_; ///< xyz = v0 position, w = material_id (uint bit pattern stored in a float)
+	Vec4 e1_; ///< xyz = v1 - v0 (precomputed edge 1), w = 0 (unused)
+	Vec4 e2_; ///< xyz = v2 - v0 (precomputed edge 2), w = 0 (unused)
+};
+
+// Per-triangle shading attributes, read only for the final closest hit (112 bytes = 7 x vec4); entry i matches compact triangle i.
+struct TriangleAttrGpu {
+	Vec4 n0_; ///< xyz = normal at v0, w = 0 (unused)
+	Vec4 n1_; ///< xyz = normal at v1, w = 0 (unused)
+	Vec4 n2_; ///< xyz = normal at v2, w = 0 (unused)
 	Vec4 uv0_uv1_; ///< xy = uv0, zw = uv1
-	Vec4 uv2_; ///< xy = uv2, zw = reserved
-	Vec4 t0_; ///< xyz = t0 (tangent at v0), w = reserved
-	Vec4 t1_; ///< xyz = t1 (tangent at v1), w = reserved
+	Vec4 uv2_; ///< xy = uv2, zw = 0 (unused)
+	Vec4 t0_; ///< xyz = tangent at v0, w = 0 (unused)
+	Vec4 t1_; ///< xyz = tangent at v1, w = 0 (unused); no tangent is stored for v2
 };

 /*
@ -116,10 +121,11 @@ public:
 	/*
 	 * @brief Upload BVH to GPU
 	 * @param node_buffer Buffer for BVH nodes
-	 * @param triangle_buffer Buffer for triangles
+	 * @param triangle_buffer Receives TriangleCompactGpu records (v0 + precomputed edges, used for intersection only)
+	 * @param attr_buffer Receives TriangleAttrGpu records in the same triangle order (fetched on hit)
 	 * @return True if upload succeeded
 	 */
-	bool upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer);
+	bool upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer, Buffer &attr_buffer);

 	/*
 	 * @brief Get total node count
--- a/include/core/raytracer.h
+++ b/include/core/raytracer.h
@ -110,7 +110,8 @@ private:
 	// BVH related
 	std::unique_ptr<BVH> bvh_;
 	Buffer bvh_node_buffer_;
-	Buffer bvh_triangle_buffer_;
+	Buffer bvh_triangle_buffer_; ///< Compact triangle data (intersection only)
+	Buffer bvh_attr_buffer_; ///< Triangle attributes (fetched on hit)
 	bool bvh_built_;

 	uint frame_count_;
--- a/shaders/include/bvh.glsl
+++ b/shaders/include/bvh.glsl
@ -38,14 +38,13 @@ bool intersect_aabb(Ray ray, vec3 aabb_min, vec3 aabb_max, float t_max) {
    return intersect_aabb_t(ray, aabb_min, aabb_max, t_max) >= 0.0;
 }

-// Moller-Trumbore triangle intersection
-bool intersect_triangle(Ray ray, TriangleGpu tri, inout HitInfo hit) {
+// Moller-Trumbore ray/triangle test on a compact triangle (edges e1 = v1-v0, e2 = v2-v0 precomputed).
+// On a hit closer than hit.t: sets hit.hit/t/position/material_id and returns true; normal/uv/tangent are not written.
+bool intersect_triangle_compact(Ray ray, TriangleCompactGpu tri, inout HitInfo hit) {
    vec3 v0 = tri.v0_material.xyz;
-    vec3 v1 = tri.v1.xyz;
-    vec3 v2 = tri.v2.xyz;
+    vec3 e1 = tri.e1.xyz;
+    vec3 e2 = tri.e2.xyz;

-    vec3 e1 = v1 - v0;
-    vec3 e2 = v2 - v0;
    vec3 pvec = cross(ray.direction, e2);
    float det = dot(e1, pvec);

@ -64,26 +63,37 @@ bool intersect_triangle(Ray ray, TriangleGpu tri, inout HitInfo hit) {
    if (t < EPSILON || t >= hit.t) return false;

    float w = 1.0 - u - v;
-    vec3 n0 = tri.n0.xyz;
-    vec3 n1 = tri.n1.xyz;
-    vec3 n2 = tri.n2.xyz;

-    vec2 uv0 = tri.uv0_uv1.xy;
-    vec2 uv1 = tri.uv0_uv1.zw;
-    vec2 uv2 = tri.uv2.xy;
-
-    vec3 t0 = tri.t0.xyz;
-    vec3 t1 = tri.t1.xyz;
-    vec3 t2 = normalize(cross(n0, t0));
+    // No attribute fetch here: bvh_attrs is indexed by triangle index, which this
+    // function does not receive (indexing by gl_GlobalInvocationID.x would read an
+    // unrelated, possibly out-of-range entry). See finalize_hit(tri_idx, ...).

    hit.hit = true;
    hit.t = t;
    hit.position = ray.origin + t * ray.direction;
+    hit.material_id = as_uint(tri.v0_material.w);
+    return true;
+}
+
+// Fills hit.normal/texcoord/tangent from bvh_attrs[tri_idx], weighting vertices v0/v1/v2 by w/u/v (the traversal passes w = 1 - u - v).
+void finalize_hit(uint tri_idx, float u, float v, float w, inout HitInfo hit) {
+    TriangleAttrGpu attr = bvh_attrs[tri_idx];
+
+    vec3 n0 = attr.n0.xyz;
+    vec3 n1 = attr.n1.xyz;
+    vec3 n2 = attr.n2.xyz;
+
+    vec2 uv0 = attr.uv0_uv1.xy;
+    vec2 uv1 = attr.uv0_uv1.zw;
+    vec2 uv2 = attr.uv2.xy;
+
+    vec3 t0 = attr.t0.xyz;
+    vec3 t1 = attr.t1.xyz;
+    vec3 t2 = normalize(cross(n0, t0)); // no tangent is stored for v2; NOTE(review): this is n0 x t0 - confirm it is the intended stand-in
+
    hit.normal = normalize(n0 * w + n1 * u + n2 * v);
    hit.texcoord = uv0 * w + uv1 * u + uv2 * v;
    hit.tangent = normalize(t0 * w + t1 * u + t2 * v);
-    hit.material_id = as_uint(tri.v0_material.w);
-    return true;
 }

 // BVH traversal (closest hit) with distance-sorted children
@ -92,6 +102,11 @@ HitInfo trace_ray_bvh(Ray ray) {
    hit.hit = false;
    hit.t = MAX_FLOAT;

+    // Track barycentric coords and triangle index for hit finalization
+    uint hit_tri_idx = 0u;
+    float hit_u = 0.0;
+    float hit_v = 0.0;
+
    if (!u_use_bvh || u_bvh_node_count == 0u) {
        return hit;
    }
@ -114,12 +129,39 @@ HitInfo trace_ray_bvh(Ray ray) {

        if (count > 0u) {
            for (uint i = 0u; i < count; ++i) {
-                TriangleGpu tri = bvh_tris[left_first + i];
-                intersect_triangle(ray, tri, hit);
+                uint tri_idx = left_first + i;
+                TriangleCompactGpu tri = bvh_tris[tri_idx];
+                vec3 v0 = tri.v0_material.xyz;
+                vec3 e1 = tri.e1.xyz;
+                vec3 e2 = tri.e2.xyz;
+
+                vec3 pvec = cross(ray.direction, e2);
+                float det = dot(e1, pvec);
+
+                if (abs(det) < EPSILON) continue;
+                float inv_det = 1.0 / det;
+
+                vec3 tvec = ray.origin - v0;
+                float u = dot(tvec, pvec) * inv_det;
+                if (u < 0.0 || u > 1.0) continue;
+
+                vec3 qvec = cross(tvec, e1);
+                float v = dot(ray.direction, qvec) * inv_det;
+                if (v < 0.0 || u + v > 1.0) continue;
+
+                float t = dot(e2, qvec) * inv_det;
+                if (t < EPSILON || t >= hit.t) continue;
+
+                // Record hit but defer attribute fetch
+                hit.hit = true;
+                hit.t = t;
+                hit.position = ray.origin + t * ray.direction;
+                hit.material_id = as_uint(tri.v0_material.w);
+                hit_tri_idx = tri_idx;
+                hit_u = u;
+                hit_v = v;
            }
        } else {
-            // Distance-sorted child traversal: push farther child first
-            // so closer child is processed first, improving early termination
            uint left = left_first;
            uint right = left_first + 1u;

@ -134,7 +176,6 @@ HitInfo trace_ray_bvh(Ray ray) {
            bool right_valid = t_right >= 0.0;

            if (left_valid && right_valid) {
-                // Both valid: push farther first
                if (t_left < t_right) {
                    if (sp < 63) stack[sp++] = right;
                    if (sp < 63) stack[sp++] = left;
@ -150,10 +191,32 @@ HitInfo trace_ray_bvh(Ray ray) {
        }
    }

+    // Fetch attributes only once for the final closest hit
+    if (hit.hit) {
+        float w = 1.0 - hit_u - hit_v;
+        TriangleAttrGpu attr = bvh_attrs[hit_tri_idx];
+
+        vec3 n0 = attr.n0.xyz;
+        vec3 n1 = attr.n1.xyz;
+        vec3 n2 = attr.n2.xyz;
+
+        vec2 uv0 = attr.uv0_uv1.xy;
+        vec2 uv1 = attr.uv0_uv1.zw;
+        vec2 uv2 = attr.uv2.xy;
+
+        vec3 t0 = attr.t0.xyz;
+        vec3 t1 = attr.t1.xyz;
+        vec3 t2 = normalize(cross(n0, t0));
+
+        hit.normal = normalize(n0 * w + n1 * hit_u + n2 * hit_v);
+        hit.texcoord = uv0 * w + uv1 * hit_u + uv2 * hit_v;
+        hit.tangent = normalize(t0 * w + t1 * hit_u + t2 * hit_v);
+    }
+
    return hit;
 }

-// Any-hit BVH for shadow ray (no sorting needed - early exit on first hit)
+// Any-hit BVH for shadow ray (no attribute fetch needed - early exit on first hit)
 bool trace_any_bvh(Ray ray, float t_max) {
    if (!u_use_bvh || u_bvh_node_count == 0u) return false;

@ -161,10 +224,6 @@ bool trace_any_bvh(Ray ray, float t_max) {
    int sp = 0;
    stack[sp++] = 0u;

-    HitInfo hit;
-    hit.hit = false;
-    hit.t = t_max;
-
    while (sp > 0) {
        uint node_idx = stack[--sp];
        if (node_idx >= u_bvh_node_count) continue;
@ -179,8 +238,29 @@ bool trace_any_bvh(Ray ray, float t_max) {

        if (count > 0u) {
            for (uint i = 0u; i < count; ++i) {
-                TriangleGpu tri = bvh_tris[left_first + i];
-                if (intersect_triangle(ray, tri, hit)) return true;
+                TriangleCompactGpu tri = bvh_tris[left_first + i];
+                vec3 v0 = tri.v0_material.xyz;
+                vec3 e1 = tri.e1.xyz;
+                vec3 e2 = tri.e2.xyz;
+
+                vec3 pvec = cross(ray.direction, e2);
+                float det = dot(e1, pvec);
+
+                if (abs(det) < EPSILON) continue;
+                float inv_det = 1.0 / det;
+
+                vec3 tvec = ray.origin - v0;
+                float u = dot(tvec, pvec) * inv_det;
+                if (u < 0.0 || u > 1.0) continue;
+
+                vec3 qvec = cross(tvec, e1);
+                float v = dot(ray.direction, qvec) * inv_det;
+                if (v < 0.0 || u + v > 1.0) continue;
+
+                float t = dot(e2, qvec) * inv_det;
+                if (t < EPSILON || t >= t_max) continue;
+
+                return true;
            }
        } else {
            uint left = left_first;
--- a/shaders/include/structs.glsl
+++ b/shaders/include/structs.glsl
@ -53,6 +53,26 @@ struct BVHNodeGpu {
    vec4 aabb_max_count;
 };

+// Compact triangle read by the BVH leaf intersection loops (48 bytes = 3 x vec4); mirrors the host-side TriangleCompactGpu.
+// Edge vectors e1 = v1-v0, e2 = v2-v0 are precomputed on upload for Moller-Trumbore.
+struct TriangleCompactGpu {
+    vec4 v0_material; ///< xyz = v0 position, w = material_id bits (decoded with as_uint)
+    vec4 e1;          ///< xyz = v1 - v0 (precomputed), w unused
+    vec4 e2;          ///< xyz = v2 - v0 (precomputed), w unused
+};
+
+// Triangle attributes read from bvh_attrs only after a hit is confirmed (112 bytes = 7 x vec4); indexed by the same triangle index as bvh_tris.
+struct TriangleAttrGpu {
+    vec4 n0;          ///< xyz = normal at v0, w unused
+    vec4 n1;          ///< xyz = normal at v1, w unused
+    vec4 n2;          ///< xyz = normal at v2, w unused
+    vec4 uv0_uv1;     ///< xy = uv0, zw = uv1
+    vec4 uv2;         ///< xy = uv2, zw unused
+    vec4 t0;          ///< xyz = tangent at v0, w unused
+    vec4 t1;          ///< xyz = tangent at v1, w unused (no tangent is stored for v2)
+};
+
+// Legacy full triangle layout (deprecated, kept for reference)
 struct TriangleGpu {
    vec4 v0_material;
    vec4 v1;
--- a/shaders/raytracing/raytracing.comp
+++ b/shaders/raytracing/raytracing.comp
@ -27,7 +27,8 @@ layout(binding = 4, rgba32f) uniform image2D accumulation_image;
 layout(std430, binding = 0) readonly buffer MaterialBuffer { Material materials[]; };
 layout(std430, binding = 1) readonly buffer LightBuffer { Light lights[]; };
 layout(std430, binding = 2) readonly buffer BVHNodeBuffer { BVHNodeGpu bvh_nodes[]; };
-layout(std430, binding = 3) readonly buffer TriangleBuffer { TriangleGpu bvh_tris[]; };
+layout(std430, binding = 3) readonly buffer TriangleBuffer { TriangleCompactGpu bvh_tris[]; };
+layout(std430, binding = 4) readonly buffer AttrBuffer { TriangleAttrGpu bvh_attrs[]; };

 // Uniforms
 uniform uint u_frame_count;
--- a/src/core/bvh.cpp
+++ b/src/core/bvh.cpp
@ -85,8 +85,7 @@ bool BVH::build(const std::vector<std::shared_ptr<Mesh>> &meshes) {
 	// Build recursively
 	build_recursive_(0, 0, n);

-	ARE_LOG_INFO("BVH built: " + std::to_string(nodes_.size()) + " nodes, " +
-		std::to_string(triangles_.size()) + " triangles");
+	ARE_LOG_INFO("BVH built: " + std::to_string(nodes_.size()) + " nodes, " + std::to_string(triangles_.size()) + " triangles");

 	return true;
 }
@ -308,8 +307,7 @@ float BVH::find_best_split_(uint first_prim, uint prim_count, int &axis, float &
 			// SAH cost: C_split = C_trav + (N_left * SA_left + N_right * SA_right) / SA_parent
 			float cost = 1.0f;
 			if (parent_sa > 0.0f) {
-				cost += (left_count * left_bounds.surface_area() +
-					right_count * right_bounds.surface_area()) / parent_sa;
+				cost += (left_count * left_bounds.surface_area() + right_count * right_bounds.surface_area()) / parent_sa;
 			}

 			if (cost < best_cost) {
@ -347,7 +345,7 @@ AABB BVH::calculate_centroid_bounds_(uint first_prim, uint prim_count) {
 	return bounds;
 }

-bool BVH::upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer) {
+bool BVH::upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer, Buffer &attr_buffer) {
 	if (nodes_.empty() || triangles_.empty()) {
 		ARE_LOG_ERROR("Cannot upload empty BVH to GPU");
 		return false;
@ -371,28 +369,36 @@ bool BVH::upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer) {
 		node_gpu[i] = g;
 	}

-	// Pack triangles to GPU layout
-	std::vector<TriangleGpu> tri_gpu;
-	tri_gpu.resize(ordered_triangles.size());
+	// Pack compact triangles (intersection only, 48 bytes each)
+	std::vector<TriangleCompactGpu> tri_compact;
+	tri_compact.resize(ordered_triangles.size());
 	for (size_t i = 0; i < ordered_triangles.size(); ++i) {
 		const Triangle &t = ordered_triangles[i];

-		TriangleGpu g {};
+		TriangleCompactGpu g {};
 		g.v0_material_ = Vec4(t.v0_, glm::uintBitsToFloat(t.material_id_));
-		g.v1_ = Vec4(t.v1_, 0.0f);
-		g.v2_ = Vec4(t.v2_, 0.0f);
+		g.e1_ = Vec4(t.v1_ - t.v0_, 0.0f);
+		g.e2_ = Vec4(t.v2_ - t.v0_, 0.0f);

+		tri_compact[i] = g;
+	}
+
+	// Pack triangle attributes (fetched only on hit, 112 bytes each)
+	std::vector<TriangleAttrGpu> tri_attr;
+	tri_attr.resize(ordered_triangles.size());
+	for (size_t i = 0; i < ordered_triangles.size(); ++i) {
+		const Triangle &t = ordered_triangles[i];
+
+		TriangleAttrGpu g {};
 		g.n0_ = Vec4(t.n0_, 0.0f);
 		g.n1_ = Vec4(t.n1_, 0.0f);
 		g.n2_ = Vec4(t.n2_, 0.0f);
-
 		g.uv0_uv1_ = Vec4(t.uv0_.x, t.uv0_.y, t.uv1_.x, t.uv1_.y);
 		g.uv2_ = Vec4(t.uv2_.x, t.uv2_.y, 0.0f, 0.0f);
-
 		g.t0_ = Vec4(t.t0_, 0.0f);
 		g.t1_ = Vec4(t.t1_, 0.0f);

-		tri_gpu[i] = g;
+		tri_attr[i] = g;
 	}

 	if (!node_buffer.create(BufferType::SHADER_STORAGE_BUFFER,
@ -404,14 +410,22 @@ bool BVH::upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer) {
 	}

 	if (!triangle_buffer.create(BufferType::SHADER_STORAGE_BUFFER,
-			tri_gpu.size() * sizeof(TriangleGpu),
-			tri_gpu.data(),
+			tri_compact.size() * sizeof(TriangleCompactGpu),
+			tri_compact.data(),
 			BufferUsage::STATIC_DRAW)) {
-		ARE_LOG_ERROR("Failed to upload BVH triangles to GPU");
+		ARE_LOG_ERROR("Failed to upload BVH compact triangles to GPU");
 		return false;
 	}

-	ARE_LOG_INFO("BVH uploaded to GPU successfully");
+	if (!attr_buffer.create(BufferType::SHADER_STORAGE_BUFFER,
+			tri_attr.size() * sizeof(TriangleAttrGpu),
+			tri_attr.data(),
+			BufferUsage::STATIC_DRAW)) {
+		ARE_LOG_ERROR("Failed to upload BVH triangle attributes to GPU");
+		return false;
+	}
+
+	ARE_LOG_INFO("BVH uploaded to GPU: " + std::to_string(nodes_.size()) + " nodes, " + std::to_string(ordered_triangles.size()) + " triangles (" + std::to_string(tri_compact.size() * sizeof(TriangleCompactGpu) / 1024) + "KB compact + " + std::to_string(tri_attr.size() * sizeof(TriangleAttrGpu) / 1024) + "KB attr)");
 	return true;
 }

--- a/src/core/raytracer.cpp
+++ b/src/core/raytracer.cpp
@ -124,6 +124,7 @@ void RayTracer::release() {

 	bvh_node_buffer_.release();
 	bvh_triangle_buffer_.release();
+	bvh_attr_buffer_.release();

 	bvh_.reset();
 	bvh_built_ = false;
@ -149,7 +150,7 @@ bool RayTracer::rebuild_bvh(const Scene &scene) {
 		return false;
 	}

-	if (!bvh_->upload_to_gpu(bvh_node_buffer_, bvh_triangle_buffer_)) {
+	if (!bvh_->upload_to_gpu(bvh_node_buffer_, bvh_triangle_buffer_, bvh_attr_buffer_)) {
 		ARE_LOG_ERROR("Failed to upload BVH to GPU");
 		return false;
 	}
@ -224,6 +225,7 @@ void RayTracer::trace(const Scene &scene, const GBuffer &gbuffer, TextureHandle
 	if (config_.use_bvh_ && bvh_built_) {
 		bvh_node_buffer_.bind_base(2);
 		bvh_triangle_buffer_.bind_base(3);
+		bvh_attr_buffer_.bind_base(4);
 		compute_shader_->set_bool("u_use_bvh", true);
 		compute_shader_->set_uint("u_bvh_node_count", bvh_->get_node_count());
 	} else {