perf(solver): cache compiled system across drag steps

During interactive drag, the constraint topology is invariant — only the dragged part's parameter values change between steps. Previously, drag_step() called solve(), which rebuilt everything from scratch each frame: a new ParamTable, new Expr trees, symbolic differentiation, CSE, and compilation (~150 ms overhead per frame). Now pre_drag() builds the system once, runs the initial solve, and caches the system, symbolic Jacobian, compiled evaluator, half-spaces, weight vector, and post-step callback. drag_step() reuses all cached artifacts: it updates only the dragged part's 7 parameter values (tx, ty, tz, qw, qx, qy, qz), then runs Newton-Raphson with a BFGS fallback if Newton does not converge. drag_step() now reports dof = -1, because DOF counting is skipped during drag for speed, and it falls back to a full solve() when no cache is present. Expected ~1.5-2x speedup on drag step latency (eliminating the rebuild overhead and leaving only the irreducible Newton iteration cost).
2026-02-21 12:23:32 -06:00
parent e0468cd3c1
commit bfb787157c
1 changed files with 163 additions and 9 deletions
--- a/kindred_solver/solver.py
+++ b/kindred_solver/solver.py
@@ -91,6 +91,7 @@ class KindredSolver(kcsolve.IKCSolver):
        super().__init__()
        self._drag_ctx = None
        self._drag_parts = None
+        self._drag_cache = None
        self._limits_warned = False

    def name(self):
@@ -244,8 +245,86 @@ class KindredSolver(kcsolve.IKCSolver):
        self._drag_ctx = ctx
        self._drag_parts = set(drag_parts)
        self._drag_step_count = 0
-        result = self.solve(ctx)
-        log.info("pre_drag: initial solve status=%s", result.status)
+
+        # Build the system once and cache everything for drag_step() reuse.
+        t0 = time.perf_counter()
+        system = _build_system(ctx)
+
+        half_spaces = compute_half_spaces(
+            system.constraint_objs,
+            system.constraint_indices,
+            system.params,
+        )
+        weight_vec = build_weight_vector(system.params)
+
+        if half_spaces:
+            post_step_fn = lambda p: apply_half_space_correction(p, half_spaces)
+        else:
+            post_step_fn = None
+
+        residuals = substitution_pass(system.all_residuals, system.params)
+        residuals = single_equation_pass(residuals, system.params)
+
+        # Build symbolic Jacobian + compile once
+        from .codegen import try_compile_system
+
+        free = system.params.free_names()
+        n_res = len(residuals)
+        n_free = len(free)
+        jac_exprs = [[r.diff(name).simplify() for name in free] for r in residuals]
+        compiled_eval = try_compile_system(residuals, jac_exprs, n_res, n_free)
+
+        # Initial solve
+        converged = newton_solve(
+            residuals,
+            system.params,
+            quat_groups=system.quat_groups,
+            max_iter=100,
+            tol=1e-10,
+            post_step=post_step_fn,
+            weight_vector=weight_vec,
+            jac_exprs=jac_exprs,
+            compiled_eval=compiled_eval,
+        )
+        if not converged:
+            converged = bfgs_solve(
+                residuals,
+                system.params,
+                quat_groups=system.quat_groups,
+                max_iter=200,
+                tol=1e-10,
+                weight_vector=weight_vec,
+                jac_exprs=jac_exprs,
+                compiled_eval=compiled_eval,
+            )
+
+        # Cache for drag_step() reuse
+        cache = _DragCache()
+        cache.system = system
+        cache.residuals = residuals
+        cache.jac_exprs = jac_exprs
+        cache.compiled_eval = compiled_eval
+        cache.half_spaces = half_spaces
+        cache.weight_vec = weight_vec
+        cache.post_step_fn = post_step_fn
+        self._drag_cache = cache
+
+        # Build result
+        dof = count_dof(residuals, system.params, jac_exprs=jac_exprs)
+        result = kcsolve.SolveResult()
+        result.status = (
+            kcsolve.SolveStatus.Success if converged else kcsolve.SolveStatus.Failed
+        )
+        result.dof = dof
+        result.placements = _extract_placements(system.params, system.bodies)
+
+        elapsed = (time.perf_counter() - t0) * 1000
+        log.info(
+            "pre_drag: initial solve %s in %.1f ms — dof=%d",
+            "converged" if converged else "FAILED",
+            elapsed,
+            dof,
+        )
        return result

    def drag_step(self, drag_placements):
@@ -254,19 +333,73 @@ class KindredSolver(kcsolve.IKCSolver):
            log.warning("drag_step: no drag context (pre_drag not called?)")
            return kcsolve.SolveResult()
        self._drag_step_count = getattr(self, "_drag_step_count", 0) + 1
+
+        # Update dragged part placements in ctx (for caller consistency)
        for pr in drag_placements:
            for part in ctx.parts:
                if part.id == pr.id:
                    part.placement = pr.placement
                    break
-        t0 = time.perf_counter()
-        result = self.solve(ctx)
-        elapsed = (time.perf_counter() - t0) * 1000
-        if result.status != kcsolve.SolveStatus.Success:
-            log.warning(
-                "drag_step #%d: solve %s in %.1f ms",
+
+        cache = getattr(self, "_drag_cache", None)
+        if cache is None:
+            # Fallback: no cache, do a full solve
+            log.debug(
+                "drag_step #%d: no cache, falling back to full solve",
+                self._drag_step_count,
+            )
+            return self.solve(ctx)
+
+        t0 = time.perf_counter()
+        params = cache.system.params
+
+        # Update only the dragged part's 7 parameter values
+        for pr in drag_placements:
+            pfx = pr.id + "/"
+            params.set_value(pfx + "tx", pr.placement.position[0])
+            params.set_value(pfx + "ty", pr.placement.position[1])
+            params.set_value(pfx + "tz", pr.placement.position[2])
+            params.set_value(pfx + "qw", pr.placement.quaternion[0])
+            params.set_value(pfx + "qx", pr.placement.quaternion[1])
+            params.set_value(pfx + "qy", pr.placement.quaternion[2])
+            params.set_value(pfx + "qz", pr.placement.quaternion[3])
+
+        # Solve with cached artifacts — no rebuild
+        converged = newton_solve(
+            cache.residuals,
+            params,
+            quat_groups=cache.system.quat_groups,
+            max_iter=100,
+            tol=1e-10,
+            post_step=cache.post_step_fn,
+            weight_vector=cache.weight_vec,
+            jac_exprs=cache.jac_exprs,
+            compiled_eval=cache.compiled_eval,
+        )
+        if not converged:
+            converged = bfgs_solve(
+                cache.residuals,
+                params,
+                quat_groups=cache.system.quat_groups,
+                max_iter=200,
+                tol=1e-10,
+                weight_vector=cache.weight_vec,
+                jac_exprs=cache.jac_exprs,
+                compiled_eval=cache.compiled_eval,
+            )
+
+        result = kcsolve.SolveResult()
+        result.status = (
+            kcsolve.SolveStatus.Success if converged else kcsolve.SolveStatus.Failed
+        )
+        result.dof = -1  # skip DOF counting during drag for speed
+        result.placements = _extract_placements(params, cache.system.bodies)
+
+        elapsed = (time.perf_counter() - t0) * 1000
+        if not converged:
+            log.warning(
+                "drag_step #%d: solve FAILED in %.1f ms",
                self._drag_step_count,
-                result.status,
                elapsed,
            )
        else:
@@ -283,6 +416,7 @@ class KindredSolver(kcsolve.IKCSolver):
        self._drag_ctx = None
        self._drag_parts = None
        self._drag_step_count = 0
+        self._drag_cache = None

    # ── Diagnostics ─────────────────────────────────────────────────

@@ -300,6 +434,26 @@ class KindredSolver(kcsolve.IKCSolver):
        return True


+class _DragCache:
+    """Cached artifacts from pre_drag() reused across drag_step() calls.
+
+    During interactive drag the constraint topology is invariant — only
+    the dragged part's parameter values change.  Caching the built
+    system, symbolic Jacobian, and compiled evaluator eliminates the
+    expensive rebuild overhead (~150 ms) on every frame.
+    """
+
+    __slots__ = (
+        "system",  # _System built by _build_system(ctx); supplies params, quat_groups, bodies
+        "residuals",  # list[Expr] — all_residuals after substitution_pass + single_equation_pass
+        "jac_exprs",  # list[list[Expr]] — one row per residual, one column per free param
+        "compiled_eval",  # try_compile_system() result — Callable, or presumably None on failure
+        "half_spaces",  # list[HalfSpace] from compute_half_spaces(); falsy when none apply
+        "weight_vec",  # build_weight_vector(system.params) result (ndarray or None)
+        "post_step_fn",  # applies half-space correction; None when half_spaces is falsy
+    )
+
+
 class _System:
    """Intermediate representation of a built constraint system."""