// deno-lint-ignore-file no-unversioned-import no-import-prefix
import ollama from "npm:ollama";
import { SYSTEM_PROMPT } from "./system_prompt.ts";
import { QUESTIONS } from "./questions.ts";
import { get_vram, get_vram_auto } from "./vram.ts";
// const LOG_FULL = Deno.args.includes('--full-output');
// region Helpers
// Shared mutable benchmark state, written by the helpers below.
let questions_answered = 0; // incremented by next_question() after each completed answer
let SYSTEM_VRAM: number; // VRAM figure from vram.ts (auto-detected or manually entered); multiplies qpm into the final score
let first_answer_time: number | null = null; // Date.now() recorded in first_run(); basis for all elapsed-time math
let end_timeout: number | null = null; // id of the 5-minute hard-stop setTimeout armed in first_run()
/**
 * One-shot initialisation, invoked from next_question() once the first chat
 * request has been created: arms the 5-minute hard-stop timer and records the
 * benchmark start timestamp. Both branches are guarded, so repeated calls are
 * no-ops.
 */
const first_run = () => {
  if (end_timeout === null) {
    console.log('OLLAMA initialized. Starting benchmark timer of 5 minutes');
    console.log('');
    end_timeout = setTimeout(end_benchmark, 5 * 60 * 1000);
  }
  if (first_answer_time === null) {
    // +1 compensates for the question the caller already shift()ed off QUESTIONS.
    console.log(`Running through list of ${QUESTIONS.length + 1} questions`);
    console.log('');
    first_answer_time = Date.now();
  }
};
/**
 * Prints the final benchmark report and terminates the process.
 *
 * qpm is questions per MINUTE (answered / elapsed-minutes), measured from the
 * moment the first answer started streaming (first_answer_time, set in
 * first_run). The reported score is qpm scaled by SYSTEM_VRAM. Invoked either
 * by the 5-minute timer armed in first_run, or by run_benchmark once the
 * question list is exhausted.
 */
const end_benchmark = () => {
  // first_run() guarantees first_answer_time is set before we can get here.
  const time_taken_seconds = (Date.now() - first_answer_time!) / 1000;
  const qpm = questions_answered / (time_taken_seconds / 60);
  console.log(`\n\nBenchmark completed. Answered %c${questions_answered}%c questions in %c${time_taken_seconds.toFixed(2)}%c seconds.`, 'color: orange', '', 'color: orange', '');
  // BUG FIX: this label previously read "Questions per second" even though the
  // value is per minute (see qpm above and the gray note below).
  console.log(`Questions per minute: ${qpm}`);
  console.log('');
  console.log('This benchmark only determines the performance of a single LLM instance. Each instance runs at full speed with 8GB of VRAM.');
  console.log(`The score of the benchmark on this system will be ${(qpm * SYSTEM_VRAM).toFixed(2)} (Assuming VRAM = ${SYSTEM_VRAM})`);
  console.log('%cScore = average questions per minute, including thinking and responding time', 'color: gray');
  console.log('%cScore is raw LLM power, and does not use external tools for more accurate benchmark metrics', 'color: gray');
  Deno.exit(0);
};
/**
 * Benchmark entry point: resolves the system VRAM figure (auto-detect first,
 * then loops prompting via get_vram until a truthy value is returned), runs
 * every question in order, and prints the final report.
 */
const run_benchmark = async () => {
  SYSTEM_VRAM = (await get_vram_auto())!;
  let warned = false;
  // Fall back to manual entry; a falsy (0/NaN) value keeps prompting.
  while (!SYSTEM_VRAM) {
    console.clear();
    console.log(`Steward LLM Benchmark v0.1\n`);
    if (!warned) {
      console.log('Could not automatically get system VRAM information.');
      warned = true;
    }
    SYSTEM_VRAM = get_vram()!;
  }
  console.log('');
  // Questions are consumed (shift) by next_question, so this drains the list.
  while (QUESTIONS.length) await next_question();
  end_benchmark();
};
// region LLM Setup
// Model under test, served by the local ollama instance.
const LLM_MODEL = 'qwen3:8b';
// Phases of a single streamed answer; used in next_question() to detect
// phase transitions while consuming the response stream.
enum QuestionState {
STARTED,
THINKING,
RESPONDING,
DONE,
}
// Date.now() captured when the current question's THINKING phase began.
// Module-level on purpose: the next next_question() call reads it to print
// the previous question's total time.
let step_start = 0;
// Streams the model's answer to the next question, printing THINKING /
// RESPONDING / CLEANING progress markers with per-phase timings. Mutates
// QUESTIONS (shift), questions_answered and step_start.
const next_question = async () => {
const question = QUESTIONS.shift()!;
// Stateless request: the full system prompt + question are resent each time.
// seed/temperature pinned to 0 for reproducibility; keep_alive: 0 unloads
// the model after each answer (the "CLEANING" phase printed below).
const stream = await ollama.chat({
model: LLM_MODEL,
messages: [{
role: 'system',
content: SYSTEM_PROMPT
}, {
role: 'user',
content: question
}],
tools: [],
think: true,
keep_alive: 0,
stream: true,
options: {
seed: 0,
temperature: 0,
}
})
// Arms the 5-minute timer and start timestamp on the first call only.
first_run();
let state: QuestionState = QuestionState.STARTED;
// NOTE(review): doneThinking only ever re-assigns RESPONDING, which the
// branch above already set — looks redundant; confirm before removing.
let doneThinking = false;
const encoder = new TextEncoder();
// Close the previous question's "CLEANING..." line. step_start still holds
// that question's THINKING start, hence "seconds total".
if(questions_answered > 0) {
const step_time = Date.now() - step_start;
// NOTE(review): Deno.stdout.write returns a Promise that is never awaited
// here (or below); output could in principle interleave — confirm intended.
Deno.stdout.write(encoder.encode(` DONE (${(step_time/1000).toFixed(2)} seconds total)\n`))
}
console.log(`Q#${(questions_answered+1).toString().padStart(3, '0')}: ${question}`);
for await (const chunk of stream) {
if(chunk.message.thinking) {
// First thinking token: start the phase timer and print the marker once.
if(state !== QuestionState.THINKING) {
step_start = Date.now();
Deno.stdout.write(encoder.encode(" '- THINKING... "));
// if(LOG_FULL) console.log('');
state = QuestionState.THINKING;
}
// if(LOG_FULL) Deno.stdout.write(encoder.encode(chunk.message.thinking));
}
if(chunk.message.content) {
// First content token: close the THINKING line with its elapsed time.
if(state !== QuestionState.RESPONDING) {
if(state == QuestionState.THINKING) {
const step_time = Date.now() - step_start;
Deno.stdout.write(encoder.encode(` DONE (${(step_time/1000).toFixed(2)} seconds elapsed)\n`))
}
// NOTE(review): step_start is NOT reset here, so the RESPONDING timing
// printed after the loop includes thinking time — confirm intended.
Deno.stdout.write(encoder.encode(" '- RESPONDING... "));
// if(LOG_FULL) console.log('');
state = QuestionState.RESPONDING;
}
if(!doneThinking) {
doneThinking = true;
state = QuestionState.RESPONDING;
}
// if(LOG_FULL) Deno.stdout.write(encoder.encode(chunk.message.content));
}
}
// Close the RESPONDING line; elapsed is measured from step_start, which was
// set when THINKING began (see note above).
if(state == QuestionState.RESPONDING) {
const step_time = Date.now() - step_start;
Deno.stdout.write(encoder.encode(`DONE (${(step_time/1000).toFixed(2)} seconds elapsed)\n`))
}
state = QuestionState.DONE;
questions_answered++;
// keep_alive: 0 makes ollama unload the model now; the next call prints this
// line's DONE marker with the total time.
Deno.stdout.write(encoder.encode(" '- CLEANING... "));
}
// region Run
// Kick off the benchmark (top-level await; the process exits via Deno.exit
// inside end_benchmark).
await run_benchmark();