perf(solver): cache compiled system across drag steps

During interactive drag, the constraint topology is invariant — only the
dragged part's parameter values change between steps. Previously,
drag_step() called solve(), which rebuilt everything from scratch each
frame: new ParamTable, new Expr trees, symbolic differentiation, CSE,
and compilation (~150 ms overhead per frame).

Now pre_drag() builds and caches the system, symbolic Jacobian, compiled
evaluator, half-spaces, and weight vector. drag_step() reuses all cached
artifacts, only updating the dragged part's 7 parameter values before
running Newton-Raphson.

Expected ~1.5-2x speedup on drag step latency (eliminating rebuild
overhead, leaving only the irreducible Newton iteration cost).
This commit is contained in:
forbes-0023
2026-02-21 12:23:32 -06:00
parent e0468cd3c1
commit bfb787157c

View File

@@ -91,6 +91,7 @@ class KindredSolver(kcsolve.IKCSolver):
super().__init__()
self._drag_ctx = None
self._drag_parts = None
self._drag_cache = None
self._limits_warned = False
def name(self):
@@ -244,8 +245,86 @@ class KindredSolver(kcsolve.IKCSolver):
self._drag_ctx = ctx
self._drag_parts = set(drag_parts)
self._drag_step_count = 0
result = self.solve(ctx)
log.info("pre_drag: initial solve status=%s", result.status)
# Build the system once and cache everything for drag_step() reuse.
t0 = time.perf_counter()
system = _build_system(ctx)
half_spaces = compute_half_spaces(
system.constraint_objs,
system.constraint_indices,
system.params,
)
weight_vec = build_weight_vector(system.params)
if half_spaces:
post_step_fn = lambda p: apply_half_space_correction(p, half_spaces)
else:
post_step_fn = None
residuals = substitution_pass(system.all_residuals, system.params)
residuals = single_equation_pass(residuals, system.params)
# Build symbolic Jacobian + compile once
from .codegen import try_compile_system
free = system.params.free_names()
n_res = len(residuals)
n_free = len(free)
jac_exprs = [[r.diff(name).simplify() for name in free] for r in residuals]
compiled_eval = try_compile_system(residuals, jac_exprs, n_res, n_free)
# Initial solve
converged = newton_solve(
residuals,
system.params,
quat_groups=system.quat_groups,
max_iter=100,
tol=1e-10,
post_step=post_step_fn,
weight_vector=weight_vec,
jac_exprs=jac_exprs,
compiled_eval=compiled_eval,
)
if not converged:
converged = bfgs_solve(
residuals,
system.params,
quat_groups=system.quat_groups,
max_iter=200,
tol=1e-10,
weight_vector=weight_vec,
jac_exprs=jac_exprs,
compiled_eval=compiled_eval,
)
# Cache for drag_step() reuse
cache = _DragCache()
cache.system = system
cache.residuals = residuals
cache.jac_exprs = jac_exprs
cache.compiled_eval = compiled_eval
cache.half_spaces = half_spaces
cache.weight_vec = weight_vec
cache.post_step_fn = post_step_fn
self._drag_cache = cache
# Build result
dof = count_dof(residuals, system.params, jac_exprs=jac_exprs)
result = kcsolve.SolveResult()
result.status = (
kcsolve.SolveStatus.Success if converged else kcsolve.SolveStatus.Failed
)
result.dof = dof
result.placements = _extract_placements(system.params, system.bodies)
elapsed = (time.perf_counter() - t0) * 1000
log.info(
"pre_drag: initial solve %s in %.1f ms — dof=%d",
"converged" if converged else "FAILED",
elapsed,
dof,
)
return result
def drag_step(self, drag_placements):
@@ -254,19 +333,73 @@ class KindredSolver(kcsolve.IKCSolver):
log.warning("drag_step: no drag context (pre_drag not called?)")
return kcsolve.SolveResult()
self._drag_step_count = getattr(self, "_drag_step_count", 0) + 1
# Update dragged part placements in ctx (for caller consistency)
for pr in drag_placements:
for part in ctx.parts:
if part.id == pr.id:
part.placement = pr.placement
break
t0 = time.perf_counter()
result = self.solve(ctx)
elapsed = (time.perf_counter() - t0) * 1000
if result.status != kcsolve.SolveStatus.Success:
log.warning(
"drag_step #%d: solve %s in %.1f ms",
cache = getattr(self, "_drag_cache", None)
if cache is None:
# Fallback: no cache, do a full solve
log.debug(
"drag_step #%d: no cache, falling back to full solve",
self._drag_step_count,
)
return self.solve(ctx)
t0 = time.perf_counter()
params = cache.system.params
# Update only the dragged part's 7 parameter values
for pr in drag_placements:
pfx = pr.id + "/"
params.set_value(pfx + "tx", pr.placement.position[0])
params.set_value(pfx + "ty", pr.placement.position[1])
params.set_value(pfx + "tz", pr.placement.position[2])
params.set_value(pfx + "qw", pr.placement.quaternion[0])
params.set_value(pfx + "qx", pr.placement.quaternion[1])
params.set_value(pfx + "qy", pr.placement.quaternion[2])
params.set_value(pfx + "qz", pr.placement.quaternion[3])
# Solve with cached artifacts — no rebuild
converged = newton_solve(
cache.residuals,
params,
quat_groups=cache.system.quat_groups,
max_iter=100,
tol=1e-10,
post_step=cache.post_step_fn,
weight_vector=cache.weight_vec,
jac_exprs=cache.jac_exprs,
compiled_eval=cache.compiled_eval,
)
if not converged:
converged = bfgs_solve(
cache.residuals,
params,
quat_groups=cache.system.quat_groups,
max_iter=200,
tol=1e-10,
weight_vector=cache.weight_vec,
jac_exprs=cache.jac_exprs,
compiled_eval=cache.compiled_eval,
)
result = kcsolve.SolveResult()
result.status = (
kcsolve.SolveStatus.Success if converged else kcsolve.SolveStatus.Failed
)
result.dof = -1 # skip DOF counting during drag for speed
result.placements = _extract_placements(params, cache.system.bodies)
elapsed = (time.perf_counter() - t0) * 1000
if not converged:
log.warning(
"drag_step #%d: solve FAILED in %.1f ms",
self._drag_step_count,
result.status,
elapsed,
)
else:
@@ -283,6 +416,7 @@ class KindredSolver(kcsolve.IKCSolver):
self._drag_ctx = None
self._drag_parts = None
self._drag_step_count = 0
self._drag_cache = None
# ── Diagnostics ─────────────────────────────────────────────────
@@ -300,6 +434,26 @@ class KindredSolver(kcsolve.IKCSolver):
return True
class _DragCache:
"""Cached artifacts from pre_drag() reused across drag_step() calls.
During interactive drag the constraint topology is invariant — only
the dragged part's parameter values change. Caching the built
system, symbolic Jacobian, and compiled evaluator eliminates the
expensive rebuild overhead (~150 ms) on every frame.
"""
__slots__ = (
"system", # _System — owns ParamTable + Expr trees
"residuals", # list[Expr] — after substitution + single-equation pass
"jac_exprs", # list[list[Expr]] — symbolic Jacobian
"compiled_eval", # Callable or None
"half_spaces", # list[HalfSpace]
"weight_vec", # ndarray or None
"post_step_fn", # Callable or None
)
class _System:
"""Intermediate representation of a built constraint system."""