Source code for milp_flare.flare

import dataclasses
import json
import re
import shutil
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from milp_flare._assets import LEAN_DIR
from milp_flare._prompts import render_flare_agent_prompt
from milp_flare.harness import Harness

#: Axioms permitted in a verified reformulation proof. All of Mathlib is built on
#: these three; a proof depending on anything else is not trusted by FLARE.
STANDARD_AXIOMS = frozenset({"propext", "Classical.choice", "Quot.sound"})



[docs]
@dataclass(frozen=True)
class FormulationInput:
    """Formulation input for the FLARE agent.

    Attributes
    ----------
    formulation_md : str
        Markdown description of the formulation. Typically produced by
        ``Formulation.render_markdown()`` from :fb:`/api/formulation.html`.
    solve_py : str
        Python solver script with Gurobi implementation of the formulation.
        :class:`FLARE` does not execute this script; it is only used as an additional
        reference for the agent. Typically produced by ``Formulation.gen_solve_py()``
        from :fb:`/api/formulation.html`.

    Examples
    --------

    Construct a ``FormulationInput`` from formulation ``p1.a`` from
    :fb:`/problems/p1.html`::

        >>> from formulation_bench import Dataset
        >>> from milp_flare import FormulationInput
        >>> ds = Dataset.load()
        >>> a = ds.problems[1].formulations["a"]
        >>> inp = FormulationInput(a.render_markdown(), a.gen_solve_py())

        >>> print(inp.formulation_md)
        # Amusement Park Ticket Machines
        <BLANKLINE>
        ## Problem Description
        <BLANKLINE>
        An amusement park is installing cash-based machines and card-only machines...

        >>> print(inp.solve_py)
        import json
        from gurobipy import Model, GRB
        import argparse
        <BLANKLINE>
        <BLANKLINE>
        def main(params_path: str, solution_path: str) -> None:
        ...
    """

    formulation_md: str
    solve_py: str




[docs]
@dataclass
class FLAREResult:
    """Result from FLARE.

    Attributes
    ----------
    is_reformulation : bool
        Final verdict. True if all sub-checks pass: both ``Formulation.lean``
        files compile, ``Reformulation.lean`` compiles and contains a
        ``def : MILPReformulation``, and the proof is ``sorry``-free using
        only the :data:`STANDARD_AXIOMS`.
    duration_s : float, optional
        Wall-clock duration of the agent run, in seconds.
    cost_usd : float, optional
        Estimated USD cost of the agent run. ``None`` when the harness
        does not report cost.
    metadata : dict[str, Any]
        Per-check breakdown and harness run metadata: ``form_a_written``,
        ``form_a_compiled``, ``form_b_written``, ``form_b_compiled``,
        ``proof_compiled``, ``milp_reform_found``, ``sorry_free``,
        ``no_new_axioms``, ``axioms``, ``agent_decision``, and token counts.
    """

    is_reformulation: bool
    duration_s: float | None = None
    cost_usd: float | None = None
    metadata: dict[str, Any] = field(default_factory=dict)




[docs]
class FLARE:
    """Agentic MILP reformulation verifier.

    FLARE (Formulation-Level Automated Reformulation Evaluation) uses an LLM-based
    agent and the Lean proof assistant to verify MILP reformulations according to
    the :fb:`/definitions.html` definition of reformulation. See the
    :paper:`/` for more details.

    Parameters
    ----------
    harness : Harness
        The agent harness to use for auto-formalization and proof synthesis.
        See :ref:`agent-harnesses` for the available harnesses.

    Attributes
    ----------
    harness : Harness
        The configured agent harness.

    Examples
    --------

    Use FLARE to verify if formulation ``b`` of problem ``p1`` from
    :fb:`/problems/p1.html` is a reformulation of formulation ``a`` (also see
    :doc:`/user_guide/run_flare`):

    .. code-block:: python

        from pathlib import Path
        from formulation_bench import Dataset
        from milp_flare import FLARE, FormulationInput
        from milp_flare.harness import ClaudeCodeHarness

        ds = Dataset.load()
        a = ds.problems[1].formulations["a"]
        b = ds.problems[1].formulations["b"]

        harness = ClaudeCodeHarness(model="claude-opus-4-7")
        flare = FLARE(harness=harness)
        result = flare.verify(
            FormulationInput(a.render_markdown(), a.gen_solve_py()),
            FormulationInput(b.render_markdown(), b.gen_solve_py()),
            output_path=Path("runs/p1_a_b"),
        )
    """

    def __init__(self, harness: Harness) -> None:
        self.harness = harness


[docs]
    def get_config_dict(self) -> dict[str, Any]:
        """Return a dictionary with the FLARE configuration.

        Returns
        -------
        config : dict[str, Any]
            Harness, image, model, and reasoning configuration. Forwarded
            directly from :meth:`Harness.get_config_dict`.
        """
        return self.harness.get_config_dict()



[docs]
    def verify(
        self,
        a: FormulationInput,
        b: FormulationInput,
        output_path: Path,
    ) -> FLAREResult:
        """Run FLARE on a pair of MILP formulations.

        Run FLARE to determine if formulation ``b`` is a reformulation of
        formulation ``a`` according to the :fb:`/definitions.html`
        definition of reformulation. This method creates the agent working
        directory (see below), triggers the agent, and evaluates the
        agent output. Finally, it populates ``output_path`` with:

        - The agent working directory (``wd/``)
        - The FLARE configuration (``config.json``)
        - The result dictionary (``result.json``)

        The agent working directory contains descriptions of each formulation,
        the agent prompt ``prompt.txt`` (see :doc:`/prompts`), and the necessary
        Lake environment files.

        .. code-block::

            wd/
            ├── A/
            │   ├── formulation.md
            │   ├── solve.py
            │   └── Formulation.lean   # written by agent
            ├── B/
            │   ├── formulation.md
            │   ├── solve.py
            │   └── Formulation.lean   # written by agent
            ├── Reformulation.lean     # written by agent
            ├── prompt.txt
            ├── Common.lean
            ├── lake-manifest.json
            ├── lakefile.toml
            └── lean-toolchain

        Parameters
        ----------
        a : FormulationInput
            Inputs for formulation A.
        b : FormulationInput
            Inputs for formulation B (the candidate reformulation of A).
        output_path : pathlib.Path
            Directory to populate with run artifacts.

        Returns
        -------
        result : FLAREResult
            The verdict, duration, cost, and per-check metadata.

        Examples
        --------

        Run FLARE on formulations ``a`` and ``b`` and inspect the result:

        .. code-block:: python

            flare = FLARE(harness=harness)
            result = flare.verify(a, b, output_path=Path("runs/a_b"))
            result.is_reformulation
            True
            result.metadata["form_a_written"]
            True
            result.cost_usd
            1.49
            result.duration_s
            322

        See :doc:`/user_guide/run_flare` for more example usage.
        """
        # Create the artifacts directory at the output path and write config
        artifacts_dir = output_path
        artifacts_dir.mkdir(parents=True, exist_ok=True)
        (artifacts_dir / "config.json").write_text(
            json.dumps(self.get_config_dict(), indent=2)
        )

        # Setup the agent's working directory
        wd = artifacts_dir / "wd"
        self._setup_wd(wd, a, b)

        # Run the agent harness
        run_result = self.harness.run(wd)

        # Evaluate the agent's output to obtain final result and write metadata
        meta = self._evaluate(wd)
        meta.update(dataclasses.asdict(run_result))
        (artifacts_dir / "result.json").write_text(json.dumps(meta, indent=2))

        return FLAREResult(
            is_reformulation=meta["is_reformulation"],
            duration_s=meta.get("duration_s"),
            cost_usd=meta.get("cost_usd"),
            metadata=meta,
        )


    def _setup_wd(
        self,
        wd: Path,
        a: FormulationInput,
        b: FormulationInput,
    ) -> None:
        """Populate the agent working directory with all necessary files."""

        wd.mkdir(parents=True, exist_ok=True)

        # Setup Lean environment configuration
        self._setup_lake(wd)

        # Populate problem files for Formulation A and B
        for label, inp in [("A", a), ("B", b)]:
            form_dir = wd / label
            form_dir.mkdir(exist_ok=True)
            (form_dir / "formulation.md").write_text(inp.formulation_md)
            (form_dir / "solve.py").write_text(inp.solve_py)
            (form_dir / "Formulation.lean").write_text("")

        # Create empty Reformulation.lean
        (wd / "Reformulation.lean").write_text("")

        # Write the agent prompt
        # For Docker harnesses, this allows agent scripts to access the prompt
        # in the container (see milp_flare/assets/scripts/*.sh).
        (wd / "prompt.txt").write_text(render_flare_agent_prompt())

        # Do any harness-specific configuration (e.g., agent.sh, MCP config, skills).
        self.harness.configure_wd(wd)

    def _setup_lake(self, wd: Path) -> None:
        """Setup minimal Lake environment so the agent can compile Lean files."""
        for src in LEAN_DIR.iterdir():
            shutil.copy2(src, wd / src.name)

    def _evaluate(self, wd: Path) -> dict[str, Any]:
        """Evaluate the agent's output to determine if the reformulation is correct."""

        # Expected agent output files
        form_a_lean = wd / "A" / "Formulation.lean"
        form_b_lean = wd / "B" / "Formulation.lean"
        reform_file = wd / "Reformulation.lean"

        # Check if the agent wrote the expected files
        form_a_written = form_a_lean.exists() and form_a_lean.stat().st_size > 0
        form_b_written = form_b_lean.exists() and form_b_lean.stat().st_size > 0
        reform_content = reform_file.read_text().strip() if reform_file.exists() else ""
        proof_written = bool(reform_content)

        # Check for agent self-reported non-reformulation decisions
        decision = self._check_agent_decision(reform_content)
        if decision is not None:
            return {
                "is_reformulation": False,
                "agent_decision": decision,
                "agent_reason": reform_content,
                "form_a_written": form_a_written,
                "form_b_written": form_b_written,
                "form_a_compiled": None,
                "form_b_compiled": None,
                "proof_compiled": None,
                "milp_reform_found": None,
                "sorry_free": None,
                "no_new_axioms": None,
                "axioms": None,
            }

        # Load the agent's compile results and log
        result_path = wd / "result.json"
        compile_log_path = wd / "compile_log.txt"
        entry_result: dict[str, Any] = {}
        if result_path.exists():
            try:
                entry_result = json.loads(result_path.read_text())
            except json.JSONDecodeError:
                entry_result = {}
        compile_log = compile_log_path.read_text() if compile_log_path.exists() else ""

        # Generate compilation status for each file
        form_a_compiled = (
            form_a_written and entry_result.get("form_a_compile_exit") == 0
        )
        form_b_compiled = (
            form_b_written and entry_result.get("form_b_compile_exit") == 0
        )
        proof_compiled = proof_written and entry_result.get("compile_exit") == 0

        # Check if Reformulation.lean contains a MILPReformulation def
        milp_reform_found = bool(
            re.search(r"\bdef\s+\w+\s*:\s*MILPReformulation\b", reform_content)
        )

        # Check if `sorry` is used in the reformulation proof
        sorry_free = (
            "uses `sorry`" not in compile_log
            if (proof_compiled and milp_reform_found)
            else False
        )

        # Check the reformulation proof's axiom dependencies
        no_new_axioms, axioms = self._check_axioms(compile_log)
        no_new_axioms = (
            no_new_axioms if (proof_compiled and milp_reform_found) else False
        )

        is_reformulation = (
            form_a_compiled
            and form_b_compiled
            and proof_compiled
            and milp_reform_found
            and sorry_free
            and no_new_axioms
        )

        return {
            "is_reformulation": is_reformulation,
            "agent_decision": "reformulation" if is_reformulation else "failed",
            "form_a_written": form_a_written,
            "form_b_written": form_b_written,
            "form_a_compiled": form_a_compiled,
            "form_b_compiled": form_b_compiled,
            "proof_compiled": proof_compiled,
            "milp_reform_found": milp_reform_found,
            "sorry_free": sorry_free,
            "no_new_axioms": no_new_axioms,
            "axioms": axioms,
        }

    def _check_agent_decision(self, reform_content: str) -> str | None:
        """Check for agent self-reported non-reformulation decisions"""
        first_line = next(
            (line.strip() for line in reform_content.splitlines() if line.strip()),
            "",
        )
        normalized = re.sub(r"^-+\s*", "", first_line).upper()
        if normalized.startswith("NOT REFORMULATION"):
            return "not_reformulation"
        if normalized.startswith("MCP_UNAVAILABLE"):
            return "mcp_unavailable"
        return None

    def _check_axioms(self, compile_log: str) -> tuple[bool, list[str]]:
        """Parse ``#print axioms`` output and check it against the allowlist."""
        match = re.search(r"depends on axioms: \[([^\]]*)\]", compile_log)
        if match is None:
            if "does not depend on any axioms" in compile_log:
                return True, []
            return False, []
        axioms = [a.strip() for a in match.group(1).split(",") if a.strip()]
        no_new_axioms = all(a in STANDARD_AXIOMS for a in axioms)
        return no_new_axioms, axioms