diff --git a/examples/cornell_box_metal_sphere b/examples/cornell_box_metal_sphere index 0fafc48..30920a9 100644 Binary files a/examples/cornell_box_metal_sphere and b/examples/cornell_box_metal_sphere differ diff --git a/include/core/bvh.h b/include/core/bvh.h index 411fead..22b231e 100644 --- a/include/core/bvh.h +++ b/include/core/bvh.h @@ -53,17 +53,19 @@ struct Triangle { }; // BVH node for GPU +// Internal node: left_first_ = left child index, count_ = 0 (right child = left_first_ + 1) +// Leaf node: left_first_ = triangle offset in sorted array, count_ = triangle count struct BVHNode { Vec3 aabb_min_; - uint left_first_; // Left child index or first primitive index + uint left_first_; // Left child index (internal) or first primitive index (leaf) Vec3 aabb_max_; - uint count_; // 0 for interior node, >0 for leaf node + uint count_; // 0 for internal node, >0 for leaf (triangle count) }; // GPU-friendly BVH node layout (std430 aligned) struct BVHNodeGpu { Vec4 aabb_min_left_first_; ///< xyz = aabb min, w = left_first (uint) - Vec4 aabb_max_count_; ///< xyz = aabb max, w = count (uint, 0 for interior) + Vec4 aabb_max_count_; ///< xyz = aabb max, w = count (uint, 0 for internal) }; // GPU-friendly triangle layout (std430 aligned) @@ -80,7 +82,22 @@ struct TriangleGpu { Vec4 t1_; ///< xyz = t1 (tangent at v1), w = reserved }; -// Bounding Volume Hierarchy for ray tracing acceleration +/* + * @brief Bounding Volume Hierarchy using top-down SAH construction + * + * Algorithm: + * 1. Extract triangles from meshes and transform to world space + * 2. Sort triangles by Morton code for spatial coherence + * 3. Build BVH top-down using SAH (Surface Area Heuristic) with 16-bin evaluation + * 4. Node layout ensures children are at consecutive indices for GPU efficiency + * + * Node layout (GPU-friendly): + * - Internal nodes: left_first_ = left child index, right = left_first_ + 1 + * - Leaf nodes: left_first_ = triangle offset, count_ = triangle count + * + * Time complexity: O(n log n) average with SAH binning + * Space complexity: O(n) + */ class BVH { public: // Constructor @@ -126,39 +143,43 @@ public: private: std::vector nodes_; std::vector triangles_; - std::vector triangle_indices_; + std::vector triangle_indices_; // Indirection array for partitioning /* - * @brief Recursively build BVH - * @param node_idx Current node index - * @param first_prim First primitive index - * @param prim_count Primitive count + * @brief Extract triangles from meshes and transform to world space + */ + void extract_triangles_(const std::vector> &meshes); + + /* + * @brief Sort triangles by Morton code for spatial coherence + */ + void sort_triangles_by_morton_(); + + /* + * @brief Recursively build BVH using SAH + * @param node_idx Current node index to fill + * @param first_prim First primitive index in triangle_indices_ + * @param prim_count Number of primitives */ void build_recursive_(uint node_idx, uint first_prim, uint prim_count); /* - * @brief Find best split using SAH + * @brief Find best split using SAH with binning * @param first_prim First primitive index * @param prim_count Primitive count - * @param axis Split axis (output) - * @param split_pos Split position (output) - * @return Split cost + * @param axis Best split axis (output) + * @param split_pos Best split position (output) + * @return SAH cost of best split */ float find_best_split_(uint first_prim, uint prim_count, int &axis, float &split_pos); /* * @brief Calculate node bounds - * @param first_prim First primitive index - * @param prim_count Primitive count - * @return Bounding box */ AABB calculate_bounds_(uint first_prim, uint prim_count); /* * @brief Calculate centroid bounds - * @param first_prim First primitive index - * @param prim_count Primitive count - * @return Centroid bounding box */ AABB calculate_centroid_bounds_(uint first_prim, uint prim_count); }; diff --git a/shaders/include/bvh.glsl b/shaders/include/bvh.glsl index ffbc39d..3af0922 100644 --- a/shaders/include/bvh.glsl +++ b/shaders/include/bvh.glsl @@ -15,8 +15,8 @@ vec3 oct_decode(vec2 f) { return normalize(n); } -// Ray-AABB intersection -bool intersect_aabb(Ray ray, vec3 aabb_min, vec3 aabb_max, float t_max) { +// Ray-AABB intersection: returns t_enter if hit, -1.0 if miss +float intersect_aabb_t(Ray ray, vec3 aabb_min, vec3 aabb_max, float t_max) { vec3 inv_d = 1.0 / ray.direction; vec3 t0 = (aabb_min - ray.origin) * inv_d; vec3 t1 = (aabb_max - ray.origin) * inv_d; @@ -27,7 +27,15 @@ bool intersect_aabb(Ray ray, vec3 aabb_min, vec3 aabb_max, float t_max) { float tmin = max(max(tmin3.x, tmin3.y), tmin3.z); float tmax2 = min(min(tmax3.x, tmax3.y), tmax3.z); - return (tmax2 >= max(tmin, 0.0)) && (tmin <= t_max); + if ((tmax2 >= max(tmin, 0.0)) && (tmin <= t_max)) { + return max(tmin, 0.0); + } + return -1.0; +} + +// Ray-AABB intersection (boolean version for shadow rays) +bool intersect_aabb(Ray ray, vec3 aabb_min, vec3 aabb_max, float t_max) { + return intersect_aabb_t(ray, aabb_min, aabb_max, t_max) >= 0.0; } // Moller-Trumbore triangle intersection @@ -78,7 +86,7 @@ bool intersect_triangle(Ray ray, TriangleGpu tri, inout HitInfo hit) { return true; } -// BVH traversal (closest hit) +// BVH traversal (closest hit) with distance-sorted children HitInfo trace_ray_bvh(Ray ray) { HitInfo hit; hit.hit = false; @@ -110,17 +118,42 @@ HitInfo trace_ray_bvh(Ray ray) { intersect_triangle(ray, tri, hit); } } else { + // Distance-sorted child traversal: push farther child first + // so closer child is processed first, improving early termination uint left = left_first; uint right = left_first + 1u; - if (sp < 63) stack[sp++] = right; - if (sp < 63) stack[sp++] = left; + + float t_left = intersect_aabb_t(ray, + bvh_nodes[left].aabb_min_left_first.xyz, + bvh_nodes[left].aabb_max_count.xyz, hit.t); + float t_right = intersect_aabb_t(ray, + bvh_nodes[right].aabb_min_left_first.xyz, + bvh_nodes[right].aabb_max_count.xyz, hit.t); + + bool left_valid = t_left >= 0.0; + bool right_valid = t_right >= 0.0; + + if (left_valid && right_valid) { + // Both valid: push farther first + if (t_left < t_right) { + if (sp < 63) stack[sp++] = right; + if (sp < 63) stack[sp++] = left; + } else { + if (sp < 63) stack[sp++] = left; + if (sp < 63) stack[sp++] = right; + } + } else if (left_valid) { + if (sp < 63) stack[sp++] = left; + } else if (right_valid) { + if (sp < 63) stack[sp++] = right; + } } } return hit; } -// Any-hit BVH for shadow ray +// Any-hit BVH for shadow ray (no sorting needed - early exit on first hit) bool trace_any_bvh(Ray ray, float t_max) { if (!u_use_bvh || u_bvh_node_count == 0u) return false; @@ -142,7 +175,7 @@ bool trace_any_bvh(Ray ray, float t_max) { uint left_first = as_uint(node.aabb_min_left_first.w); uint count = as_uint(node.aabb_max_count.w); - if (!intersect_aabb(ray, bmin, bmax, hit.t)) continue; + if (!intersect_aabb(ray, bmin, bmax, t_max)) continue; if (count > 0u) { for (uint i = 0u; i < count; ++i) { diff --git a/src/core/bvh.cpp b/src/core/bvh.cpp index 2cabf48..5bedb84 100644 --- a/src/core/bvh.cpp +++ b/src/core/bvh.cpp @@ -46,12 +46,52 @@ BVH::~BVH() { clear(); } +void BVH::clear() { + nodes_.clear(); + triangles_.clear(); + triangle_indices_.clear(); +} + bool BVH::build(const std::vector> &meshes) { clear(); ARE_LOG_INFO("Building BVH..."); - // Extract all triangles from meshes + // Step 1: Extract triangles from meshes + extract_triangles_(meshes); + + if (triangles_.empty()) { + ARE_LOG_WARN("No triangles to build BVH"); + return false; + } + + // Step 2: Sort triangles by Morton code for spatial coherence + sort_triangles_by_morton_(); + + // Step 3: Initialize triangle indices (identity mapping after Morton sort) + uint n = static_cast(triangles_.size()); + triangle_indices_.resize(n); + for (uint i = 0; i < n; ++i) { + triangle_indices_[i] = i; + } + + // Step 4: Build BVH top-down using SAH + // Reserve space: worst case 2n-1 nodes for binary tree with 1 tri per leaf + nodes_.reserve(2 * n - 1); + + // Create root node + nodes_.emplace_back(); + + // Build recursively + build_recursive_(0, 0, n); + + ARE_LOG_INFO("BVH built: " + std::to_string(nodes_.size()) + " nodes, " + + std::to_string(triangles_.size()) + " triangles"); + + return true; +} + +void BVH::extract_triangles_(const std::vector> &meshes) { for (const auto &mesh : meshes) { const auto &vertices = mesh->get_vertices(); const auto &indices = mesh->get_indices(); @@ -61,7 +101,6 @@ bool BVH::build(const std::vector> &meshes) { for (size_t i = 0; i < indices.size(); i += 3) { Triangle tri; - // Transform vertices Vec4 v0 = transform * Vec4(vertices[indices[i]].position_, 1.0f); Vec4 v1 = transform * Vec4(vertices[indices[i + 1]].position_, 1.0f); Vec4 v2 = transform * Vec4(vertices[indices[i + 2]].position_, 1.0f); @@ -70,18 +109,15 @@ bool BVH::build(const std::vector> &meshes) { tri.v1_ = Vec3(v1) / v1.w; tri.v2_ = Vec3(v2) / v2.w; - // Transform normals Mat3 normal_matrix = glm::transpose(glm::inverse(Mat3(transform))); tri.n0_ = glm::normalize(normal_matrix * vertices[indices[i]].normal_); tri.n1_ = glm::normalize(normal_matrix * vertices[indices[i + 1]].normal_); tri.n2_ = glm::normalize(normal_matrix * vertices[indices[i + 2]].normal_); - // Transform tangents tri.t0_ = glm::normalize(normal_matrix * vertices[indices[i]].tangent_); tri.t1_ = glm::normalize(normal_matrix * vertices[indices[i + 1]].tangent_); tri.t2_ = glm::normalize(normal_matrix * vertices[indices[i + 2]].tangent_); - // Copy UVs tri.uv0_ = vertices[indices[i]].texcoord_; tri.uv1_ = vertices[indices[i + 1]].texcoord_; tri.uv2_ = vertices[indices[i + 2]].texcoord_; @@ -91,30 +127,74 @@ bool BVH::build(const std::vector> &meshes) { triangles_.push_back(tri); } } +} - if (triangles_.empty()) { - ARE_LOG_WARN("No triangles to build BVH"); - return false; +// Morton code helper: interleave bits +static uint32_t part1by2(uint32_t x) { + x &= 0x000003ffu; + x = (x ^ (x << 16)) & 0xff0000ffu; + x = (x ^ (x << 8)) & 0x0300f00fu; + x = (x ^ (x << 4)) & 0x030c30c3u; + x = (x ^ (x << 2)) & 0x09249249u; + return x; +} + +static uint32_t compute_morton_code(const Vec3 &p, const Vec3 &min, const Vec3 &max) { + Vec3 scale = Vec3(1023.0f) / (max - min + Vec3(1e-6f)); + Vec3 v = (p - min) * scale; + + uint32_t ix = glm::clamp(static_cast(v.x), 0, 1023); + uint32_t iy = glm::clamp(static_cast(v.y), 0, 1023); + uint32_t iz = glm::clamp(static_cast(v.z), 0, 1023); + + return (part1by2(iz) << 2) | (part1by2(iy) << 1) | part1by2(ix); +} + +void BVH::sort_triangles_by_morton_() { + if (triangles_.empty()) + return; + + // Compute scene bounds + AABB scene_bounds; + for (const auto &tri : triangles_) { + scene_bounds.expand(tri.get_bounds()); } - // Initialize triangle indices - triangle_indices_.resize(triangles_.size()); + // Expand bounds slightly + Vec3 padding = (scene_bounds.max_ - scene_bounds.min_) * 0.001f; + scene_bounds.min_ -= padding; + scene_bounds.max_ += padding; + + // Compute Morton codes with indices + struct MortonEntry { + uint32_t code; + size_t original_index; + }; + + std::vector entries; + entries.reserve(triangles_.size()); + for (size_t i = 0; i < triangles_.size(); ++i) { - triangle_indices_[i] = static_cast(i); + uint32_t code = compute_morton_code(triangles_[i].get_centroid(), + scene_bounds.min_, scene_bounds.max_); + entries.push_back({ code, i }); } - // Reserve space for nodes (estimate) - nodes_.reserve(triangles_.size() * 2); + // Sort by Morton code + std::sort(entries.begin(), entries.end(), + [](const MortonEntry &a, const MortonEntry &b) { + return a.code < b.code; + }); - // Create root node - nodes_.emplace_back(); + // Reorder triangles + std::vector sorted_triangles; + sorted_triangles.reserve(triangles_.size()); - // Build BVH recursively - build_recursive_(0, 0, static_cast(triangles_.size())); + for (const auto &entry : entries) { + sorted_triangles.push_back(triangles_[entry.original_index]); + } - ARE_LOG_INFO("BVH built: " + std::to_string(nodes_.size()) + " nodes, " + std::to_string(triangles_.size()) + " triangles"); - - return true; + triangles_ = std::move(sorted_triangles); } void BVH::build_recursive_(uint node_idx, uint first_prim, uint prim_count) { @@ -125,28 +205,10 @@ void BVH::build_recursive_(uint node_idx, uint first_prim, uint prim_count) { node.aabb_min_ = bounds.min_; node.aabb_max_ = bounds.max_; - // Leaf node threshold - const uint LEAF_SIZE = 4; - - if (prim_count <= LEAF_SIZE) { + // Leaf node: 1 triangle per leaf for optimal GPU traversal + if (prim_count <= 1) { node.left_first_ = first_prim; - node.count_ = prim_count; - return; - } - - // Calculate current depth - uint current_depth = 0; - uint idx = node_idx; - while (idx > 0) { - idx = (idx - 1) / 2; - current_depth++; - } - const uint MAX_DEPTH = 32; - - // Force leaf if max depth reached - if (current_depth >= MAX_DEPTH) { - node.left_first_ = first_prim; - node.count_ = prim_count; + node.count_ = 1; return; } @@ -155,22 +217,9 @@ void BVH::build_recursive_(uint node_idx, uint first_prim, uint prim_count) { float split_pos = 0.0f; float split_cost = find_best_split_(first_prim, prim_count, axis, split_pos); - // SAH cost comparison (normalized) - // C_split = C_trav + (N_left * SA_left + N_right * SA_right) / SA_parent - // C_leaf = N * C_int - // With C_trav = 1, C_int = 1: - // Split if C_split < C_leaf - // (Constants are used in find_best_split_ for cost calculation) - + // If SAH says no split is beneficial, force median split + // For GPU ray tracing, deeper trees with 1 tri per leaf are preferred if (split_cost == std::numeric_limits::max() || split_cost >= static_cast(prim_count)) { - // SAH says no split is beneficial, but force split if too many prims - const uint MAX_PRIMS_PER_LEAF = 8; - if (prim_count <= MAX_PRIMS_PER_LEAF) { - node.left_first_ = first_prim; - node.count_ = prim_count; - return; - } - // Force median split as fallback AABB cb = calculate_centroid_bounds_(first_prim, prim_count); for (int a = 0; a < 3; ++a) { float extent = cb.max_[a] - cb.min_[a]; @@ -194,7 +243,7 @@ void BVH::build_recursive_(uint node_idx, uint first_prim, uint prim_count) { } } - // Ensure we have primitives on both sides + // Ensure split produces non-empty partitions if (mid == first_prim || mid == first_prim + prim_count) { mid = first_prim + prim_count / 2; } @@ -203,10 +252,11 @@ void BVH::build_recursive_(uint node_idx, uint first_prim, uint prim_count) { uint left_count = mid - first_prim; uint right_count = prim_count - left_count; + // Store left child index (children will be at left_first_ and left_first_ + 1) node.left_first_ = static_cast(nodes_.size()); - node.count_ = 0; + node.count_ = 0; // Internal node - // Create child nodes + // Create child nodes (contiguous indices) nodes_.emplace_back(); nodes_.emplace_back(); @@ -217,7 +267,8 @@ void BVH::build_recursive_(uint node_idx, uint first_prim, uint prim_count) { float BVH::find_best_split_(uint first_prim, uint prim_count, int &axis, float &split_pos) { float best_cost = std::numeric_limits::max(); - axis = 0, split_pos = 0.0f; + axis = 0; + split_pos = 0.0f; AABB centroid_bounds = calculate_centroid_bounds_(first_prim, prim_count); AABB parent_bounds = calculate_bounds_(first_prim, prim_count); @@ -229,13 +280,12 @@ float BVH::find_best_split_(uint first_prim, uint prim_count, int &axis, float & if (extent < EPSILON) continue; - // Try multiple split positions using 16 bins + // 16-bin SAH const int NUM_BINS = 16; for (int i = 1; i < NUM_BINS; ++i) { float t = static_cast(i) / NUM_BINS; float pos = centroid_bounds.min_[a] + t * extent; - // Count primitives and calculate bounds for each side AABB left_bounds, right_bounds; uint left_count = 0, right_count = 0; @@ -252,13 +302,14 @@ float BVH::find_best_split_(uint first_prim, uint prim_count, int &axis, float & } } - // Calculate normalized SAH cost if (left_count == 0 || right_count == 0) continue; - float cost = 1.0f; // Traversal cost + // SAH cost: C_split = C_trav + (N_left * SA_left + N_right * SA_right) / SA_parent + float cost = 1.0f; if (parent_sa > 0.0f) { - cost += (left_count * left_bounds.surface_area() + right_count * right_bounds.surface_area()) / parent_sa; + cost += (left_count * left_bounds.surface_area() + + right_count * right_bounds.surface_area()) / parent_sa; } if (cost < best_cost) { @@ -338,7 +389,6 @@ bool BVH::upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer) { g.uv0_uv1_ = Vec4(t.uv0_.x, t.uv0_.y, t.uv1_.x, t.uv1_.y); g.uv2_ = Vec4(t.uv2_.x, t.uv2_.y, 0.0f, 0.0f); - // Pack tangents g.t0_ = Vec4(t.t0_, 0.0f); g.t1_ = Vec4(t.t1_, 0.0f); @@ -365,10 +415,4 @@ bool BVH::upload_to_gpu(Buffer &node_buffer, Buffer &triangle_buffer) { return true; } -void BVH::clear() { - nodes_.clear(); - triangles_.clear(); - triangle_indices_.clear(); -} - } // namespace are