Reinforcement learning with verifiable rewards (RLVR) has shown promise in enhancing the reasoning capabilities of large language models by learning directly from outcome-based rewards. Recent RLVR works that operate under the zero setting avoid supervision in labeling the reasoning process, but still depend on manually curated collections of questions and answers for training.
transformers
and vllm
.input = [1, 2, 3, 4, 5]
, and checks if the expected output (2 + 4 = 6) is clear and testable. The task is valid, so it gets a high learnability reward.
def sum_even_numbers(numbers):
    return sum(num for num in numbers if num % 2 == 0)
[1, 2, 3, 4, 5]
→ 6, [2, 4, 6]
→ 12
). The solution is correct, so AZR gets a high accuracy reward.
conda create -n azr python=3.10
conda activate azr
pip install -r requirements.txt
vllm==0.7.3
and transformers==4.47.1
.data/<new_ded_abd_seed_data_name>.jsonl
).
bash scripts/selfplay/7b.sh
python -m absolute_zero_reasoner.utils.convert2hf <veRL_ckpt_path>/actor <veRL_ckpt_path>/actor/huggingface/ <hf_ckpt_path>
azr.reward.generation_reward_config.
White Paper: Absolute Zero: Reinforced Self-play Reasoning with Zero Data
GitHub: Absolute-Zero-Reasoner
Here is a .NET 4.8 Console Application written in C#:
namespace AbsoluteZeroReasoner
{
#region Using Statements:
using System;
using System.IO;
using System.Linq;
using Microsoft.CSharp;
using System.Reflection;
using System.CodeDom.Compiler;
using System.Collections.Generic;
#endregion
/// <summary>
/// A reasoning system that trains on reasoning tasks using a language model and reinforcement learning.
/// It generates, solves, and verifies tasks, optimizing for learnability and accuracy.
/// </summary>
public class AbsoluteZeroReasoner
{
#region Fields:
private readonly ILanguageModel _languageModel;
private readonly IReinforcementLearner _rlAgent;
private readonly CodeExecutor _codeExecutor;
private readonly Random _random;
private readonly double _learnabilityRewardWeight;
private readonly double _accuracyRewardWeight;
private readonly int _maxIterations;
#endregion
/// <summary>
/// Creates a reasoner bound to the given language model and reinforcement learning agent.
/// </summary>
/// <param name="languageModel">Model asked to produce task proposals and candidate solutions.</param>
/// <param name="rlAgent">Agent that receives the per-task rewards.</param>
/// <param name="learnabilityRewardWeight">Factor applied to the learnability reward when the total reward is reported (default: 0.5).</param>
/// <param name="accuracyRewardWeight">Factor applied to the accuracy reward when the total reward is reported (default: 0.5).</param>
/// <param name="maxIterations">Number of passes <see cref="Train"/> performs (default: 1000).</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="languageModel"/> or <paramref name="rlAgent"/> is null.</exception>
public AbsoluteZeroReasoner(
    ILanguageModel languageModel,
    IReinforcementLearner rlAgent,
    double learnabilityRewardWeight = 0.5,
    double accuracyRewardWeight = 0.5,
    int maxIterations = 1000)
{
    // Guard clauses first, in the same order the arguments are declared.
    if (languageModel == null)
    {
        throw new ArgumentNullException(nameof(languageModel));
    }

    if (rlAgent == null)
    {
        throw new ArgumentNullException(nameof(rlAgent));
    }

    _languageModel = languageModel;
    _rlAgent = rlAgent;

    _maxIterations = maxIterations;
    _accuracyRewardWeight = accuracyRewardWeight;
    _learnabilityRewardWeight = learnabilityRewardWeight;

    _random = new Random();
    _codeExecutor = new CodeExecutor();
}
/// <summary>
/// Runs the training loop: each iteration takes the next task from the file (while any remain)
/// or asks the language model to propose one, then solves it, verifies it and reports the rewards
/// to the reinforcement learning agent.
/// </summary>
/// <param name="taskFilePath">Path to a JSON file containing tasks (optional).</param>
public void Train(string taskFilePath = null)
{
    List<ReasoningTask> queuedTasks = null;
    if (taskFilePath != null)
    {
        queuedTasks = LoadTasksFromFile(taskFilePath);
    }

    int nextQueued = 0;

    for (int pass = 0; pass < _maxIterations; pass++)
    {
        Console.WriteLine($"\n=== Iteration {pass + 1} ===");

        ReasoningTask current;
        double learnability;

        bool hasQueuedTask = queuedTasks != null && nextQueued < queuedTasks.Count;
        if (hasQueuedTask)
        {
            // File-provided tasks are consumed in order before any task is proposed.
            current = queuedTasks[nextQueued];
            learnability = ComputeLearnabilityReward(current);
            nextQueued++;
        }
        else
        {
            string proposalJson = ProposeTask();
            var validated = ValidateProposedTask(proposalJson);
            if (validated.Task == null)
            {
                // A rejected proposal is penalised and the iteration is spent.
                Console.WriteLine("Invalid task proposed.");
                _rlAgent.Update(learnabilityReward: -1.0, accuracyReward: 0.0, taskId: null);
                continue;
            }

            current = validated.Task;
            learnability = validated.LearnabilityReward;
        }

        Console.WriteLine($"Proposed Task: {current.Question}, Expected Output: {current.ExpectedOutput}");

        string candidate = SolveTask(current);
        Console.WriteLine($"Generated Solution: {candidate}");

        var verdict = VerifySolution(current, candidate);
        string shownOutput;
        if (verdict.Error != null)
        {
            shownOutput = "Error: " + verdict.Error;
        }
        else
        {
            shownOutput = verdict.ActualOutput ?? "null";
        }

        Console.WriteLine($"Actual Output: {shownOutput}");
        Console.WriteLine($"Accuracy Reward: {verdict.AccuracyReward}");

        // The weighted total is only reported; the agent receives the two raw rewards.
        double totalReward = (_learnabilityRewardWeight * learnability) +
                             (_accuracyRewardWeight * verdict.AccuracyReward);

        _rlAgent.Update(learnability, verdict.AccuracyReward, current.Id);
        Console.WriteLine($"Learnability Reward: {learnability}, Total Reward: {totalReward}");
    }
}
/// <summary>
/// Loads reasoning tasks from a JSON file containing a flat array of objects whose
/// <c>id</c>, <c>question</c> and <c>expected_output</c> fields are strings.
/// </summary>
/// <remarks>
/// This is a deliberately small hand-rolled reader, not a general JSON parser: objects are
/// separated on the literal text <c>},{</c>, nested objects/arrays are not supported, and
/// objects missing any of the three required fields are skipped.
/// </remarks>
/// <param name="filePath">Path to the JSON file containing tasks.</param>
/// <returns>
/// A list of <see cref="ReasoningTask"/> objects, or null if the file is missing, cannot be read,
/// or contains no usable task.
/// </returns>
private List<ReasoningTask> LoadTasksFromFile(string filePath)
{
    if (!File.Exists(filePath))
    {
        Console.WriteLine($"File {filePath} not found. Falling back to autonomous task generation.");
        return null;
    }
    try
    {
        string json = File.ReadAllText(filePath);
        Console.WriteLine($"Raw JSON: {json.Substring(0, Math.Min(100, json.Length))}..."); // Log JSON snippet
        var tasks = new List<ReasoningTask>();

        // Trim whitespace before the brackets: a trailing newline would otherwise shield the closing ']'.
        string[] taskStrings = json.Trim().Trim('[', ']').Split(new[] { "},{" }, StringSplitOptions.None);
        foreach (string taskStr in taskStrings)
        {
            string cleaned = taskStr.Trim().Trim('{', '}').Replace("},", "").Replace("{", "");

            // Split on commas that are outside quoted strings.
            var parts = new List<string>();
            var currentPart = new System.Text.StringBuilder();
            bool inQuotes = false;
            for (int i = 0; i < cleaned.Length; i++)
            {
                char c = cleaned[i];
                if (c == '\\' && inQuotes && i + 1 < cleaned.Length)
                {
                    // Escaped character inside a string: keep both characters and do not let
                    // an escaped quote flip the in-quotes state.
                    currentPart.Append(c).Append(cleaned[++i]);
                    continue;
                }
                if (c == '"')
                {
                    inQuotes = !inQuotes;
                }
                else if (c == ',' && !inQuotes)
                {
                    parts.Add(currentPart.ToString().Trim());
                    currentPart.Clear();
                    continue;
                }
                currentPart.Append(c);
            }
            if (currentPart.Length > 0)
            {
                parts.Add(currentPart.ToString().Trim());
            }

            var dict = new Dictionary<string, string>();
            foreach (var part in parts)
            {
                var kv = part.Split(new[] { ":" }, 2, StringSplitOptions.None);
                if (kv.Length == 2)
                {
                    // Whitespace must go before the quotes: Trim('"') alone stops at the space
                    // that follows the colon and leaves a stray leading quote in the value.
                    string key = kv[0].Trim().Trim('"');
                    string value = UnquoteJsonValue(kv[1]);
                    if (!dict.ContainsKey(key)) // First occurrence of a key wins
                    {
                        dict[key] = value;
                    }
                }
            }

            if (dict.TryGetValue("id", out string id) &&
                dict.TryGetValue("question", out string question) &&
                dict.TryGetValue("expected_output", out string expectedOutput))
            {
                tasks.Add(new ReasoningTask
                {
                    Id = id,
                    Question = question,
                    ExpectedOutput = expectedOutput
                });
            }
        }
        Console.WriteLine($"Loaded {tasks.Count} tasks from {filePath}");
        return tasks.Count > 0 ? tasks : null;
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error reading tasks from {filePath}: {ex.Message}. Falling back to autonomous task generation.");
        return null;
    }
}

/// <summary>
/// Removes surrounding whitespace and one pair of enclosing double quotes from a raw field
/// value, then resolves backslash escapes (<c>\"</c>, <c>\\</c>, <c>\n</c>, <c>\r</c>, <c>\t</c>).
/// </summary>
/// <param name="raw">The text that followed the colon of a key/value pair.</param>
/// <returns>The decoded value.</returns>
private static string UnquoteJsonValue(string raw)
{
    string text = raw.Trim();
    if (text.Length >= 2 && text[0] == '"' && text[text.Length - 1] == '"')
    {
        text = text.Substring(1, text.Length - 2);
    }
    else
    {
        // Malformed (unbalanced) quoting: fall back to stripping whatever quotes are there.
        text = text.Trim('"');
    }

    if (text.IndexOf('\\') < 0)
    {
        return text;
    }

    var decoded = new System.Text.StringBuilder(text.Length);
    for (int i = 0; i < text.Length; i++)
    {
        char c = text[i];
        if (c != '\\' || i + 1 >= text.Length)
        {
            decoded.Append(c);
            continue;
        }
        char escaped = text[++i];
        switch (escaped)
        {
            case 'n': decoded.Append('\n'); break;
            case 'r': decoded.Append('\r'); break;
            case 't': decoded.Append('\t'); break;
            default: decoded.Append(escaped); break; // covers \" \\ and \/
        }
    }
    return decoded.ToString();
}
/// <summary>
/// Asks the language model for a new task of a randomly chosen reasoning type
/// (deduction, abduction or induction).
/// </summary>
/// <returns>Whatever the language model returns for the prompt; expected to be a JSON task description.</returns>
private string ProposeTask()
{
    var reasoningModes = new[] { "deduction", "abduction", "induction" };
    int pick = _random.Next(reasoningModes.Length);
    string taskType = reasoningModes[pick];

    string opening = $"Generate a {taskType} reasoning task in the form of a C# coding problem. ";
    const string fieldSpec =
        "Return the task as a JSON string with 'id' (unique string), 'question' (problem description), and " +
        "'expected_output' (expected result).";

    return _languageModel.Generate(opening + fieldSpec);
}
/// <summary>
/// Parses a proposed task, rejects it when a required field is missing or the code executor
/// deems it invalid, and otherwise scores its learnability.
/// </summary>
/// <param name="taskJson">JSON string representing the proposed task.</param>
/// <returns>
/// The parsed <see cref="ReasoningTask"/> with its learnability reward, or (null, -1.0) when the
/// proposal is rejected or an exception occurs while validating it.
/// </returns>
private (ReasoningTask Task, double LearnabilityReward) ValidateProposedTask(string taskJson)
{
    try
    {
        ReasoningTask candidate = ParseTaskJson(taskJson);

        bool missingField = string.IsNullOrEmpty(candidate.Id)
            || string.IsNullOrEmpty(candidate.Question)
            || string.IsNullOrEmpty(candidate.ExpectedOutput);

        // Short-circuit keeps the executor from being consulted for incomplete tasks.
        if (missingField || !_codeExecutor.IsValidTask(candidate))
        {
            return (null, -1.0);
        }

        return (candidate, ComputeLearnabilityReward(candidate));
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Task validation error: {ex.Message}");
        return (null, -1.0);
    }
}
/// <summary>
/// Builds the "solve" prompt for a task and forwards it to the language model.
/// </summary>
/// <param name="task">The reasoning task to solve; its question is placed on the prompt's second line.</param>
/// <returns>The language model's response, expected to be C# code that assigns to a variable.</returns>
private string SolveTask(ReasoningTask task)
{
    string header = $"Solve the following C# coding problem:\n{task.Question}\n";
    const string instruction = "Provide the solution as a C# statement that assigns to a variable.";

    return _languageModel.Generate(header + instruction);
}
/// <summary>
/// Verifies a solution by executing it and comparing its output to the expected output.
/// </summary>
/// <remarks>
/// Both sides are trimmed and lower-cased with the invariant culture before an ordinal
/// comparison, so the verdict does not depend on the machine's current culture (for example
/// the Turkish dotted/dotless "i" casing rules). Double quotes are removed from the expected
/// value only.
/// </remarks>
/// <param name="task">The reasoning task being verified.</param>
/// <param name="solution">The generated solution code.</param>
/// <returns>
/// A tuple containing the actual output, the accuracy reward (1.0 for correct, -1.0 for incorrect
/// or failed execution), and the error message when execution threw (otherwise null).
/// </returns>
private (string ActualOutput, double AccuracyReward, string Error) VerifySolution(ReasoningTask task, string solution)
{
    try
    {
        string actualOutput = _codeExecutor.ExecuteSolution(solution);

        // Machine-readable text: normalise with the invariant culture, never the current one.
        string normalizedActual = actualOutput?.Trim().ToLowerInvariant();
        string normalizedExpected = task.ExpectedOutput?.Trim().ToLowerInvariant().Replace("\"", "");

        bool isCorrect = string.Equals(normalizedActual, normalizedExpected, StringComparison.Ordinal);
        Console.WriteLine($"Comparing: Actual='{normalizedActual}', Expected='{normalizedExpected}', IsCorrect={isCorrect}");
        return (actualOutput, isCorrect ? 1.0 : -1.0, null);
    }
    catch (Exception ex)
    {
        // Compilation and runtime failures both count as an incorrect answer.
        return (null, -1.0, ex.Message);
    }
}
/// <summary>
/// Scores a task's learnability from the length of its question: one hundredth of the
/// character count, capped at 1.0.
/// </summary>
/// <param name="task">The reasoning task to evaluate.</param>
/// <returns>A learnability reward value between 0.0 and 1.0.</returns>
private double ComputeLearnabilityReward(ReasoningTask task)
{
    double scaledLength = task.Question.Length / 100.0;
    return scaledLength > 1.0 ? 1.0 : scaledLength;
}
/// <summary>
/// Parses a JSON string into a <see cref="ReasoningTask"/> object by looking up the
/// <c>id</c>, <c>question</c> and <c>expected_output</c> fields by their quoted key names.
/// </summary>
/// <remarks>
/// Values may contain commas, colons and escaped quotes; fields that are absent are left null.
/// This is a minimal field extractor for flat objects, not a general JSON parser.
/// </remarks>
/// <param name="json">The JSON string to parse.</param>
/// <returns>A <see cref="ReasoningTask"/> object, or an empty task if the input is blank or parsing fails.</returns>
private ReasoningTask ParseTaskJson(string json)
{
    if (string.IsNullOrWhiteSpace(json))
    {
        return new ReasoningTask();
    }

    try
    {
        return new ReasoningTask
        {
            Id = ExtractJsonStringField(json, "id"),
            Question = ExtractJsonStringField(json, "question"),
            ExpectedOutput = ExtractJsonStringField(json, "expected_output")
        };
    }
    catch (Exception ex)
    {
        // Keep the "empty task on failure" contract, but leave a trace instead of failing silently.
        Console.WriteLine($"Task JSON parse error: {ex.Message}");
        return new ReasoningTask();
    }
}

/// <summary>
/// Finds <c>"key": value</c> in a flat JSON object and returns the decoded value.
/// </summary>
/// <param name="json">The JSON text to search.</param>
/// <param name="key">The field name, without quotes.</param>
/// <returns>
/// The string value with escapes resolved, the raw text of a non-string value, or null when the
/// key is absent or its value is the literal <c>null</c>.
/// </returns>
private static string ExtractJsonStringField(string json, string key)
{
    string quotedKey = "\"" + key + "\"";
    int searchFrom = 0;

    while (true)
    {
        int keyStart = json.IndexOf(quotedKey, searchFrom, StringComparison.Ordinal);
        if (keyStart < 0)
        {
            return null;
        }

        int pos = keyStart + quotedKey.Length;
        searchFrom = pos;

        while (pos < json.Length && char.IsWhiteSpace(json[pos]))
        {
            pos++;
        }
        if (pos >= json.Length || json[pos] != ':')
        {
            // The quoted text was not used as a key (e.g. it appeared inside a value); keep looking.
            continue;
        }

        pos++;
        while (pos < json.Length && char.IsWhiteSpace(json[pos]))
        {
            pos++;
        }
        if (pos >= json.Length)
        {
            return null;
        }

        if (json[pos] != '"')
        {
            // Non-string value (number, true/false, null): take the raw token.
            int end = json.IndexOfAny(new[] { ',', '}' }, pos);
            if (end < 0)
            {
                end = json.Length;
            }
            string token = json.Substring(pos, end - pos).Trim();
            return token == "null" ? null : token;
        }

        var value = new System.Text.StringBuilder();
        for (pos++; pos < json.Length; pos++)
        {
            char c = json[pos];
            if (c == '"')
            {
                return value.ToString();
            }
            if (c == '\\' && pos + 1 < json.Length)
            {
                char escaped = json[++pos];
                switch (escaped)
                {
                    case 'n': value.Append('\n'); break;
                    case 'r': value.Append('\r'); break;
                    case 't': value.Append('\t'); break;
                    default: value.Append(escaped); break; // covers \" \\ and \/
                }
                continue;
            }
            value.Append(c);
        }

        // Unterminated string: return what was read rather than discarding the field.
        return value.ToString();
    }
}
}
/// <summary>
/// Plain data holder for one reasoning task: an identifier, the question text and the
/// expected output, all stored as strings.
/// </summary>
public class ReasoningTask
{
/// <summary>
/// Gets or sets the task identifier. It is passed to the reinforcement learning agent
/// as the key under which attempts for this task are reported.
/// </summary>
public string Id { get; set; }
/// <summary>
/// Gets or sets the problem description. It is embedded in the prompt sent to the
/// language model, and its length drives the learnability reward.
/// </summary>
public string Question { get; set; }
/// <summary>
/// Gets or sets the expected output, kept as text; verification compares it with the
/// executed solution's output after trimming, lower-casing and removing double quotes.
/// </summary>
public string ExpectedOutput { get; set; }
}
/// <summary>
/// Contract for the language model the reasoner uses to obtain task proposals and
/// candidate solutions, and to produce a task dataset file.
/// </summary>
public interface ILanguageModel
{
/// <summary>
/// Generates content for a prompt. The reasoner calls this both with a "Generate a ..."
/// prompt (expecting a JSON task description) and with a "Solve the following ..." prompt
/// (expecting C# code).
/// </summary>
/// <param name="prompt">The input prompt for content generation.</param>
/// <returns>The generated content as a string.</returns>
string Generate(string prompt);
/// <summary>
/// Generates a dataset of tasks and saves it to a file.
/// </summary>
/// <param name="taskCount">The number of tasks to generate.</param>
/// <param name="filePath">The file path to save the dataset.</param>
void GenerateDataset(int taskCount, string filePath);
}
/// <summary>
/// Contract for the reinforcement learning agent that receives per-task rewards and
/// exposes a per-task attempt/success history.
/// </summary>
public interface IReinforcementLearner
{
/// <summary>
/// Reports the outcome of one task attempt to the agent.
/// </summary>
/// <param name="learnabilityReward">The learnability reward for the task.</param>
/// <param name="accuracyReward">The accuracy reward for the task.</param>
/// <param name="taskId">
/// The identifier of the task; the reasoner passes null when a proposed task was rejected
/// before it could be attempted.
/// </param>
void Update(double learnabilityReward, double accuracyReward, string taskId);
/// <summary>
/// Retrieves the task history, including attempts and successes.
/// </summary>
/// <returns>A dictionary mapping task IDs to their attempt and success counts.</returns>
Dictionary<string, (int Attempts, int Successes)> GetTaskHistory();
}
/// <summary>
/// Validates reasoning tasks and compiles and runs generated C# snippets inside the current process.
/// </summary>
/// <remarks>
/// SECURITY: <see cref="ExecuteSolution"/> compiles and invokes whatever code it is given, with the
/// full privileges of the host process and no sandbox or timeout. Only feed it code from a trusted
/// source, or move execution into an isolated process before using it with a real language model.
/// </remarks>
public class CodeExecutor
{
    /// <summary>
    /// Validates whether a reasoning task is well-formed, i.e. it is not null and its id,
    /// question and expected output are all non-empty.
    /// </summary>
    /// <param name="task">The reasoning task to validate.</param>
    /// <returns>True if the task is valid; otherwise, false.</returns>
    public bool IsValidTask(ReasoningTask task)
    {
        return task != null &&
               !string.IsNullOrEmpty(task.Id) &&
               !string.IsNullOrEmpty(task.Question) &&
               !string.IsNullOrEmpty(task.ExpectedOutput);
    }

    /// <summary>
    /// Wraps the snippet in <c>Solution.Run()</c>, compiles it in memory, invokes it and returns
    /// the text of the snippet's <c>result</c> variable.
    /// </summary>
    /// <param name="code">
    /// C# statements that declare and assign a local named <c>result</c>; the wrapper appends
    /// <c>return result.ToString();</c>.
    /// </param>
    /// <returns>The output of the executed code as a string.</returns>
    /// <exception cref="Exception">
    /// Thrown if the code is blank, or if compilation or execution fails. The message starts with
    /// "Execution error: " and the original failure is available as the inner exception.
    /// </exception>
    public string ExecuteSolution(string code)
    {
        if (string.IsNullOrWhiteSpace(code))
        {
            throw new Exception("Execution error: no solution code was provided.");
        }

        try
        {
            // The provider is IDisposable; release it as soon as compilation is done.
            using (CSharpCodeProvider provider = new CSharpCodeProvider())
            {
                CompilerParameters parameters = new CompilerParameters
                {
                    GenerateInMemory = true,
                    GenerateExecutable = false
                };
                parameters.ReferencedAssemblies.Add("System.dll");

                string wrappedCode = @"
using System;
public class Solution {
public static string Run() {
" + code + @"
return result.ToString();
}
}";
                Console.WriteLine($"Compiling code:\n{wrappedCode}");

                CompilerResults results = provider.CompileAssemblyFromSource(parameters, wrappedCode);
                if (results.Errors.HasErrors)
                {
                    string errors = string.Join("\n", results.Errors.Cast<CompilerError>().Select(e => e.ToString()));
                    throw new Exception($"Compilation error: {errors}");
                }

                var assembly = results.CompiledAssembly;
                var type = assembly.GetType("Solution");
                var method = type.GetMethod("Run");
                var result = method.Invoke(null, null);
                return result?.ToString();
            }
        }
        catch (TargetInvocationException ex) when (ex.InnerException != null)
        {
            // Reflection wraps whatever the snippet threw; surface the real failure, not the
            // generic "exception has been thrown by the target of an invocation" text.
            throw new Exception($"Execution error: {ex.InnerException.Message}", ex.InnerException);
        }
        catch (Exception ex)
        {
            throw new Exception($"Execution error: {ex.Message}", ex);
        }
    }
}
/// <summary>
/// A stub implementation of <see cref="ILanguageModel"/> that dynamically generates tasks and solutions.
/// </summary>
public class DynamicStubLanguageModel : ILanguageModel
{
#region Fields:
private readonly Random _random;
private int _taskCounter;
private readonly Dictionary<string, (int Attempts, int Successes)> _taskHistory;
private readonly List<string> _recentTasks;
#endregion
/// <summary>
/// Initializes a new instance of the <see cref="DynamicStubLanguageModel"/> class.
/// </summary>
/// <param name="rlAgent">
/// The reinforcement learning agent whose task history is consulted when weighting task templates.
/// The dictionary returned by <see cref="IReinforcementLearner.GetTaskHistory"/> is stored as-is,
/// not copied.
/// </param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="rlAgent"/> is null.</exception>
public DynamicStubLanguageModel(IReinforcementLearner rlAgent)
{
    if (rlAgent == null)
    {
        throw new ArgumentNullException(nameof(rlAgent));
    }

    _random = new Random();
    _taskCounter = 0;
    // An agent without history is treated as having an empty one so later lookups cannot fail.
    _taskHistory = rlAgent.GetTaskHistory() ?? new Dictionary<string, (int Attempts, int Successes)>();
    _recentTasks = new List<string>();
}
/// <summary>
/// Generates content based on a prompt: a JSON task for prompts containing "Generate a",
/// or a C# solution for prompts containing "Solve the following".
/// </summary>
/// <param name="prompt">The input prompt for content generation.</param>
/// <returns>
/// The generated content as a string (a JSON task or C# solution), or an empty string when the
/// prompt is null, empty or of an unrecognised kind.
/// </returns>
public string Generate(string prompt)
{
    if (string.IsNullOrEmpty(prompt))
    {
        return "";
    }

    if (prompt.Contains("Generate a"))
    {
        _taskCounter++;
        string taskId = $"task_{_taskCounter}";
        string taskType = prompt.Contains("deduction") ? "deduction" :
                          prompt.Contains("abduction") ? "abduction" : "induction";
        var (question, expectedOutput, _) = GenerateNewTask(taskType);
        return $"{{\"id\": \"{taskId}\", \"question\": \"{question} ({taskType})\", \"expected_output\": \"{expectedOutput}\"}}";
    }
    else if (prompt.Contains("Solve the following"))
    {
        // The question normally sits on the second line; fall back to the first line
        // instead of throwing when the prompt has no line break.
        string[] lines = prompt.Split('\n');
        string questionLine = lines.Length > 1 ? lines[1] : lines[0];

        // Drop the " (taskType)" suffix that task generation appends to the question.
        string question = questionLine.Split('(')[0].Trim();
        return GenerateSolutionForQuestion(question);
    }
    return "";
}
/// <summary>
/// Generates a dataset of tasks and saves it to a JSON file as an array of objects with
/// <c>id</c>, <c>question</c>, <c>expected_output</c> and <c>solution</c> string fields.
/// </summary>
/// <remarks>
/// Task ids are numbered from <c>task_1</c> for the dataset; the instance's own task counter is
/// restored afterwards, even when generation or the file write fails.
/// </remarks>
/// <param name="taskCount">The number of tasks to generate.</param>
/// <param name="filePath">The file path to save the dataset.</param>
public void GenerateDataset(int taskCount, string filePath)
{
    var entries = new List<string>();
    int savedCounter = _taskCounter;
    _taskCounter = 0;
    try
    {
        for (int i = 0; i < taskCount; i++)
        {
            _taskCounter++;
            string taskId = $"task_{_taskCounter}";
            string taskType = new[] { "deduction", "abduction", "induction" }[_random.Next(3)];
            var (question, expectedOutput, solution) = GenerateNewTask(taskType);

            // Every value is escaped: solutions routinely contain double quotes
            // (e.g. string literals), which would otherwise produce invalid JSON.
            string idJson = EscapeJsonString(taskId);
            string questionJson = EscapeJsonString($"{question} ({taskType})");
            string expectedJson = EscapeJsonString(expectedOutput);
            string solutionJson = EscapeJsonString(solution);

            entries.Add($"{{ \"id\": \"{idJson}\", \"question\": \"{questionJson}\", \"expected_output\": \"{expectedJson}\", \"solution\": \"{solutionJson}\" }}");
        }

        string json = "[" + string.Join(",", entries) + "]";
        File.WriteAllText(filePath, json);
        Console.WriteLine($"Generated {entries.Count} tasks and saved to {filePath}");
        Console.WriteLine($"Generated JSON: {json.Substring(0, Math.Min(100, json.Length))}..."); // Log JSON snippet
    }
    finally
    {
        _taskCounter = savedCounter;
    }
}

/// <summary>
/// Escapes a value for use inside a double-quoted JSON string.
/// </summary>
/// <param name="value">The raw text; null is treated as empty.</param>
/// <returns>The text with quotes, backslashes and control characters escaped.</returns>
private static string EscapeJsonString(string value)
{
    if (string.IsNullOrEmpty(value))
    {
        return "";
    }

    var escaped = new System.Text.StringBuilder(value.Length + 8);
    foreach (char c in value)
    {
        switch (c)
        {
            case '"': escaped.Append("\\\""); break;
            case '\\': escaped.Append("\\\\"); break;
            case '\n': escaped.Append("\\n"); break;
            case '\r': escaped.Append("\\r"); break;
            case '\t': escaped.Append("\\t"); break;
            default:
                if (c < ' ')
                {
                    escaped.Append("\\u").Append(((int)c).ToString("x4"));
                }
                else
                {
                    escaped.Append(c);
                }
                break;
        }
    }
    return escaped.ToString();
}
/// <summary>
/// Generates a new reasoning task by drawing one of the built-in templates (factorial, addition,
/// square, concatenation, evenness) and filling it with random operands.
/// </summary>
/// <remarks>
/// Templates used by the last three tasks are given zero weight so the weighted draw favours
/// variety. Task number 10 is scripted: it is always the square template with operand 5
/// (the solver in this class deliberately answers that task incorrectly).
/// </remarks>
/// <param name="taskType">The type of task (deduction, abduction, or induction). Not used when choosing a template.</param>
/// <returns>A tuple containing the question, expected output, and a reference solution snippet.</returns>
private (string Question, string ExpectedOutput, string Solution) GenerateNewTask(string taskType)
{
    var templates = new List<(string QuestionTemplate, Func<int, int, (string ExpectedOutput, string Solution)>)>
    {
        ("Write a method to compute {0} factorial", (n, _) => {
            int fact = 1;
            for (int i = 1; i <= n; i++) fact *= i;
            return ($"{fact}", $"int result = 1; for (int i = 1; i <= {n}; i++) result *= i;");
        }),
        ("Write a method to compute {0} + {1}", (a, b) => ($"{a + b}", $"int result = {a} + {b};")),
        ("Write a method to compute the square of {0}", (n, _) => ($"{n * n}", $"int result = {n} * {n};")),
        ("Write a method to concatenate '{0}' and '{1}'", (a, b) => ($"{a}{b}", $"string result = \"{a}\" + \"{b}\";")),
        // The reference solution must be a compilable statement (not "bool result = True").
        ("Write a method to check if {0} is even", (n, _) => ($"{n % 2 == 0}", $"bool result = {n} % 2 == 0;"))
    };

    // Templates without a fixed weight become less likely as unsolved attempts pile up.
    int unsolvedAttempts = _taskHistory.Values.Sum(h => h.Attempts - h.Successes);
    double defaultWeight = 1.0 / (1 + unsolvedAttempts * 3.0);
    var weights = templates.Select(t =>
        t.QuestionTemplate.Contains("square") ? 2.0 :
        t.QuestionTemplate.Contains("is even") ? 1.5 :
        defaultWeight).ToList();

    // Recency is tracked per full template text so distinct templates never share a key.
    bool anyFresh = templates.Any(t => !_recentTasks.Contains(t.QuestionTemplate));
    if (!anyFresh)
    {
        _recentTasks.Clear();
    }
    for (int i = 0; i < templates.Count; i++)
    {
        if (_recentTasks.Contains(templates[i].QuestionTemplate))
        {
            weights[i] = 0.0;
        }
    }

    int templateIndex = _taskCounter == 10 ? 2 : SelectWeightedIndex(weights, templates);
    var template = templates[templateIndex];

    // Factorial operands stay below 7 so the result comfortably fits in an int.
    int param1 = _taskCounter == 10 ? 5 : _random.Next(1, template.QuestionTemplate.Contains("factorial") ? 7 : 10);
    int param2 = _random.Next(1, 100);
    string question = string.Format(template.QuestionTemplate, param1, param2);

    _recentTasks.Add(template.QuestionTemplate);
    if (_recentTasks.Count > 3)
    {
        _recentTasks.RemoveAt(0);
    }

    var (expectedOutput, solution) = template.Item2(param1, param2);
    return (question, expectedOutput, solution);
}
/// <summary>
/// Produces a C# snippet for a question by recognising one of the known task keywords
/// (checked in this order: factorial, addition, square, concatenation, evenness) and pulling
/// the operands out of the question text.
/// </summary>
/// <param name="question">The task question to solve.</param>
/// <returns>
/// A C# snippet assigning to <c>result</c>; <c>int result = 0;</c> when no keyword matches.
/// </returns>
private string GenerateSolutionForQuestion(string question)
{
    string param1;
    string param2;
    string solution;

    if (question.Contains("factorial"))
    {
        param1 = question.Split(' ').FirstOrDefault(tok => int.TryParse(tok, out _))
                 ?? _random.Next(1, 7).ToString();
        param2 = "0";
        solution = $"int result = 1; for (int i = 1; i <= {param1}; i++) result *= i;";
    }
    else if (question.Contains(" + "))
    {
        string[] sides = question.Split('+').Select(side => side.Trim()).ToArray();

        // Left operand: the last number before the '+'. Right operand: the first number after it.
        string leftToken = sides.Length > 0
            ? sides[0].Split(' ').LastOrDefault(tok => int.TryParse(tok, out _))
            : null;
        if (int.TryParse(leftToken, out int leftValue))
        {
            param1 = leftValue.ToString();
        }
        else
        {
            param1 = _random.Next(1, 10).ToString();
        }

        string rightToken = sides.Length > 1
            ? sides[1].Split(' ').FirstOrDefault(tok => int.TryParse(tok, out _))
            : null;
        if (int.TryParse(rightToken, out int rightValue))
        {
            param2 = rightValue.ToString();
        }
        else
        {
            param2 = _random.Next(1, 100).ToString();
        }

        solution = $"int result = {param1} + {param2};";
    }
    else if (question.Contains("square of"))
    {
        param1 = question.Split(' ').LastOrDefault(tok => int.TryParse(tok, out _))
                 ?? _random.Next(1, 10).ToString();
        param2 = "0";
        if (_taskCounter == 10)
        {
            // Intentionally wrong for task 10: adds the operand to itself instead of squaring it.
            solution = $"int result = {param1} + {param1};";
        }
        else
        {
            solution = $"int result = {param1} * {param1};";
        }
    }
    else if (question.Contains("concatenate"))
    {
        // Operands are the texts between the first and second pair of single quotes.
        string[] quoted = question.Split('\'');
        param1 = quoted.Length > 1 ? quoted[1] : _random.Next(1, 100).ToString();
        param2 = quoted.Length > 3 ? quoted[3] : _random.Next(1, 100).ToString();
        solution = $"string result = \"{param1}\" + \"{param2}\";";
    }
    else if (question.Contains("is even"))
    {
        param1 = question.Split(' ').FirstOrDefault(tok => int.TryParse(tok, out _))
                 ?? _random.Next(1, 10).ToString();
        param2 = "0";
        solution = $"bool result = {param1} % 2 == 0;";
    }
    else
    {
        return "int result = 0;";
    }

    Console.WriteLine($"Parsed: Question='{question}', Param1='{param1}', Param2='{param2}', Solution='{solution}'");
    return solution;
}
/// <summary>
/// Picks a template index by roulette-wheel selection over the given weights, unless the
/// square template is being forced because task_10 has no recorded success.
/// </summary>
/// <param name="weights">The weights for each template.</param>
/// <param name="templates">The list of task templates.</param>
/// <returns>The selected template index.</returns>
private int SelectWeightedIndex(IList<double> weights, IList<(string, Func<int, int, (string, string)>)> templates)
{
    // Once past task 10, keep returning the square template while task_10 remains unsolved.
    bool replaySquare = _taskCounter > 10
        && _taskHistory.TryGetValue("task_10", out var taskTen)
        && taskTen.Successes == 0;
    if (replaySquare)
    {
        int position = 0;
        foreach (var entry in templates)
        {
            if (entry.Item1.Contains("square"))
            {
                return position;
            }
            position++;
        }
    }

    double threshold = _random.NextDouble() * weights.Sum();
    double running = 0;
    int index = 0;
    foreach (double weight in weights)
    {
        running += weight;
        if (threshold <= running)
        {
            return index;
        }
        index++;
    }

    // Floating-point shortfall: fall back to the last entry.
    return weights.Count - 1;
}
}
/// <summary>
/// A stub <see cref="IReinforcementLearner"/> that performs no learning: it accumulates rewards,
/// counts successes and records per-task attempts, logging its state on every update.
/// </summary>
public class DynamicStubReinforcementLearner : IReinforcementLearner
{
    #region Fields:
    private readonly Dictionary<string, (int Attempts, int Successes)> _taskHistory =
        new Dictionary<string, (int, int)>();
    private double _cumulativeReward = 0.0;
    private int _successfulTasks = 0;
    #endregion

    /// <summary>
    /// Creates a learner with an empty history and zeroed counters.
    /// </summary>
    public DynamicStubReinforcementLearner()
    {
    }

    /// <summary>
    /// Records one task outcome: adds both rewards to the running total, counts a success when
    /// the accuracy reward is positive, updates the task's attempt/success entry (when an id is
    /// supplied) and writes the new state to the console.
    /// </summary>
    /// <param name="learnabilityReward">The learnability reward for the task.</param>
    /// <param name="accuracyReward">The accuracy reward for the task.</param>
    /// <param name="taskId">The unique identifier of the task, or null to skip the history update.</param>
    public void Update(double learnabilityReward, double accuracyReward, string taskId)
    {
        bool solved = accuracyReward > 0;

        _cumulativeReward += (learnabilityReward + accuracyReward);
        if (solved)
        {
            _successfulTasks++;
        }

        if (taskId != null)
        {
            // A missing entry yields the default (0, 0) record, i.e. a first attempt.
            _taskHistory.TryGetValue(taskId, out var record);
            _taskHistory[taskId] = (record.Attempts + 1, record.Successes + (solved ? 1 : 0));
        }

        Console.WriteLine($"RL Update: Learnability = {learnabilityReward}, Accuracy = {accuracyReward}, " +
                          $"Cumulative Reward = {_cumulativeReward}, Successful Tasks = {_successfulTasks}");

        var historyEntries = _taskHistory.Select(kv => $"{kv.Key}: {kv.Value.Successes}/{kv.Value.Attempts}");
        Console.WriteLine($"Task History: {string.Join(", ", historyEntries)}");
    }

    /// <summary>
    /// Returns the learner's own history dictionary (not a copy), so callers that keep the
    /// reference observe later updates.
    /// </summary>
    /// <returns>A dictionary mapping task IDs to their attempt and success counts.</returns>
    public Dictionary<string, (int Attempts, int Successes)> GetTaskHistory() => _taskHistory;
}
/// <summary>
/// Console entry point for the AbsoluteZeroReasoner demo.
/// </summary>
public class Program
{
    /// <summary>
    /// Wires the stub learner and stub language model together, writes a 100-task dataset to
    /// tasks.json, trains for 15 iterations on it and then waits for Enter.
    /// </summary>
    public static void Main()
    {
        const string datasetPath = "tasks.json";

        var learner = new DynamicStubReinforcementLearner();
        var model = new DynamicStubLanguageModel(learner);
        model.GenerateDataset(100, datasetPath);

        var trainer = new AbsoluteZeroReasoner(model, learner, maxIterations: 15);
        trainer.Train(datasetPath);

        Console.ReadLine();
    }
}
}