#!/usr/bin/env python3
"""
Warrant Reader Leaderboard — submission runner (v0 stub).

This is the public-facing CLI shape: the internals are stubs, but the
protocol, arguments, and emitted JSONL format are stable and will be
wire-compatible with the v1 release.

Usage:
    python run_reader.py \\
        --reader hf://meta-llama/Llama-3.1-8B-Instruct \\
        --artifact ../artifacts/frozen_retrieval_topK_500q.v1.jsonl \\
        --prompt ../artifacts/benchmark_prompt.v1.md \\
        --out submissions/llama-3.1-8b.jsonl

Reader URI schemes (planned):
    hf://<org>/<model>             — HuggingFace transformers, local download
    vllm://<host>:<port>/<model>   — OpenAI-compatible vLLM endpoint
    openai://<model>               — OpenAI Chat Completions
    anthropic://<model>            — Anthropic Messages
    local:///abs/path              — local weight directory
"""
from __future__ import annotations

import argparse
import json
import sys
from collections.abc import Iterator
from pathlib import Path
from typing import NoReturn

PROTOCOL_VERSION = "v0"


def parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--reader", required=True,
                   help="Reader URI (hf://, vllm://, openai://, anthropic://, local://).")
    p.add_argument("--artifact", required=True, type=Path,
                   help="Path to frozen_retrieval_topK_500q.v1.jsonl")
    p.add_argument("--prompt", required=True, type=Path,
                   help="Path to benchmark_prompt.v1.md")
    p.add_argument("--out", required=True, type=Path,
                   help="Output JSONL of reader answers (one per question).")
    p.add_argument("--max_new_tokens", type=int, default=256)
    p.add_argument("--temperature", type=float, default=0.0)
    p.add_argument("--reader_instructions", default="",
                   help="String to interpolate into <<reader_instructions>> in the prompt.")
    p.add_argument("--limit", type=int, default=None,
                   help="Run only the first N questions (debugging).")
    return p.parse_args()


def die(msg: str, code: int = 1) -> NoReturn:
    print(f"[run_reader] error: {msg}", file=sys.stderr)
    sys.exit(code)


def load_jsonl(path: Path) -> Iterator[dict]:
    if not path.exists():
        die(f"artifact not found: {path}\nDid you run runner/fetch_artifacts.py?")
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines
                yield json.loads(line)


def stub_call_reader(reader_uri: str, prompt: str, *, max_new_tokens: int, temperature: float) -> dict:
    """STUB. Wire to the appropriate backend in v1.

    Returns:
        {"answer": str, "cited_chunks": list[int], "refused": bool, "raw": str}
    """
    raise NotImplementedError(
        f"reader backend for {reader_uri!r} is not wired yet in this v0 stub. "
        "See runner/README.md for the planned reader URI schemes and "
        "contact contact@manifoldmemory.ai for early-access wiring."
    )
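

# --- Illustrative sketch only ----------------------------------------------
# A minimal example of what wiring one backend could look like, assuming the
# official `openai` Python SDK (>=1.0) and OPENAI_API_KEY in the environment.
# This is NOT the v1 implementation: the helper name, the URI parsing, and
# the placeholder answer parsing are all assumptions. v1 must recover
# `cited_chunks` and `refused` from the reader output per
# benchmark_prompt.v1.md.
def _sketch_call_openai(reader_uri: str, prompt: str, *, max_new_tokens: int, temperature: float) -> dict:
    from openai import OpenAI  # deferred import: only needed for this sketch

    model = reader_uri.removeprefix("openai://")
    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_new_tokens,
        temperature=temperature,
    )
    text = resp.choices[0].message.content or ""
    # Placeholder parsing: returns the raw completion verbatim and makes no
    # attempt to extract [n] citations or detect refusals.
    return {"answer": text, "cited_chunks": [], "refused": False, "raw": text}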


def main() -> None:
    args = parse_args()
    prompt_template = args.prompt.read_text(encoding="utf-8")
    args.out.parent.mkdir(parents=True, exist_ok=True)
    n = 0
    with args.out.open("w", encoding="utf-8") as fout:
        for item in load_jsonl(args.artifact):
            if args.limit is not None and n >= args.limit:
                break
            qid = item.get("qid")
            qtype = item.get("qtype")
            chunks = item.get("chunks", [])
            # Evidence block: 1-indexed [n] labels that cited_chunks refers
            # back to; capped at the top-10 retrieved chunks.
            user_block = "\n".join(
                f"[{i + 1}] {c.get('text', '')}" for i, c in enumerate(chunks[:10])
            )
            prompt = (
                prompt_template
                .replace("<<reader_instructions>>", args.reader_instructions)
                .replace("{question}", item.get("question", ""))
                .replace("{evidence_block}", user_block)
            )
            try:
                resp = stub_call_reader(
                    args.reader,
                    prompt,
                    max_new_tokens=args.max_new_tokens,
                    temperature=args.temperature,
                )
            except NotImplementedError as e:
                die(str(e), code=2)  # exits; resp is never used after this
            fout.write(json.dumps({
                "schema_version": PROTOCOL_VERSION,
                "qid": qid,
                "qtype": qtype,
                "answer": resp["answer"],
                "cited_chunks": resp["cited_chunks"],
                "refused": resp["refused"],
                "raw": resp.get("raw", ""),
            }) + "\n")
            n += 1
    print(f"[run_reader] wrote {n} answers -> {args.out}")


if __name__ == "__main__":
    main()
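
# ---------------------------------------------------------------------------
# Record shapes, for reference. Field values are illustrative, not real
# benchmark data. main() reads artifact lines with these keys:
#
#   {"qid": "q0001", "qtype": "...", "question": "...",
#    "chunks": [{"text": "..."}, ...]}
#
# and each emitted line follows the stable v0 schema written above:
#
#   {"schema_version": "v0", "qid": "q0001", "qtype": "...", "answer": "...",
#    "cited_chunks": [1, 3], "refused": false, "raw": "..."}
# ---------------------------------------------------------------------------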