Mini-SWE-Agent

GPT-5-Mini

adaptive-rejection-sampler

5

4

80.0%

bn-fit-modify

5

4

80.0%

break-filter-js-from-html

5

0

0.0%

build-cython-ext

5

0

0.0%

build-pmars

5

2

40.0%

build-pov-ray

5

0

0.0%

caffe-cifar-10

5

0

0.0%

cancel-async-tasks

5

1

20.0%

chess-best-move

5

0

0.0%

circuit-fibsqrt

5

0

0.0%

cobol-modernization

5

4

80.0%

code-from-image

5

1

20.0%

compile-compcert

5

0

0.0%

configure-git-webserver

5

2

40.0%

constraints-scheduling

5

5

100.0%

count-dataset-tokens

5

0

0.0%

crack-7z-hash

5

0

0.0%

custom-memory-heap-crash

5

4

80.0%

db-wal-recovery

5

1

20.0%

distribution-search

5

3

60.0%

dna-assembly

5

0

0.0%

dna-insert

5

0

0.0%

extract-elf

5

4

80.0%

extract-moves-from-video

5

0

0.0%

feal-differential-cryptanalysis

5

0

0.0%

feal-linear-cryptanalysis

5

3

60.0%

filter-js-from-html

5

0

0.0%

financial-document-processor

5

0

0.0%

fix-code-vulnerability

5

1

20.0%

fix-git

5

5

100.0%

fix-ocaml-gc

5

0

0.0%

gcode-to-text

5

0

0.0%

git-leak-recovery

5

3

60.0%

git-multibranch

5

5

100.0%

gpt2-codegolf

5

0

0.0%

headless-terminal

5

4

80.0%

hf-model-inference

5

4

80.0%

install-windows-3.11

5

0

0.0%

kv-store-grpc

5

2

40.0%

large-scale-text-editing

5

2

40.0%

largest-eigenval

5

0

0.0%

llm-inference-batching-scheduler

5

0

0.0%

log-summary-date-ranges

5

0

0.0%

mailman

5

0

0.0%

make-doom-for-mips

5

0

0.0%

make-mips-interpreter

5

0

0.0%

mcmc-sampling-stan

5

0

0.0%

merge-diff-arc-agi-task

5

0

0.0%

model-extraction-relu-logits

5

0

0.0%

modernize-scientific-stack

5

2

40.0%

mteb-leaderboard

5

0

0.0%

mteb-retrieve

5

0

0.0%

multi-source-data-merger

5

4

80.0%

nginx-request-logging

5

2

40.0%

openssl-selfsigned-cert

5

1

20.0%

overfull-hbox

5

0

0.0%

password-recovery

5

0

0.0%

path-tracing

5

0

0.0%

path-tracing-reverse

5

0

0.0%

polyglot-c-py

5

0

0.0%

polyglot-rust-c

5

0

0.0%

portfolio-optimization

5

4

80.0%

protein-assembly

5

0

0.0%

prove-plus-comm

5

5

100.0%

pypi-server

5

1

20.0%

pytorch-model-cli

5

0

0.0%

pytorch-model-recovery

5

3

60.0%

qemu-alpine-ssh

5

0

0.0%

qemu-startup

5

3

60.0%

query-optimize

5

0

0.0%

raman-fitting

5

0

0.0%

regex-chess

5

0

0.0%

regex-log

5

4

80.0%

reshard-c4-data

5

0

0.0%

rstan-to-pystan

5

0

0.0%

sam-cell-seg

5

0

0.0%

sanitize-git-repo

5

1

20.0%

schemelike-metacircular-eval

5

0

0.0%

sparql-university

5

0

0.0%

sqlite-db-truncate

5

0

0.0%

sqlite-with-gcov

5

0

0.0%

torch-pipeline-parallelism

5

0

0.0%

torch-tensor-parallelism

5

0

0.0%

train-fasttext

5

0

0.0%

tune-mjcf

5

1

20.0%

video-processing

5

0

0.0%

vulnerable-secret

5

4

80.0%

winning-avg-corewars

5

0

0.0%

write-compressor

5

0

0.0%

Task-level performance breakdown showing resolution rates across all trials.