-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: active_alphazero.yaml
More file actions
128 lines (102 loc) · 4.17 KB
/
active_alphazero.yaml
File metadata and controls
128 lines (102 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import random
from collections import deque

import torch
import torch.nn as nn
import torch.nn.functional as F
class AlphaNet(nn.Module):
    """Policy/value network with a single residual block.

    Expects input of shape (batch, 1, H, W) where H * W == board_size,
    so that flattening after the conv stack yields 64 * board_size
    features -- TODO confirm against callers.
    """

    def __init__(self, board_size, action_size):
        super(AlphaNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        # Two conv/BN layers whose output is added back to the input
        # (residual connection applied in forward()).
        self.res_block = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
        )
        # Policy head: a distribution over the action space.
        self.policy_head = nn.Linear(64 * board_size, action_size)
        # Value head: scalar position evaluation squashed to (-1, 1).
        self.value_head = nn.Linear(64 * board_size, 1)

    def forward(self, x):
        """Return (policy, value): softmax move probs and tanh evaluation."""
        stem = F.relu(self.bn1(self.conv1(x)))
        # Residual connection around the two-conv block.
        features = F.relu(stem + self.res_block(stem))
        flat = features.view(features.size(0), -1)
        policy = F.softmax(self.policy_head(flat), dim=1)
        value = torch.tanh(self.value_head(flat))
        return policy, value
def train_alphazero(model, game_env, iterations=100):
    """Run AlphaZero-style self-play training for ``iterations`` games.

    Args:
        model: network passed to ``mcts_search`` / ``update_model``
            (both defined elsewhere in the project).
        game_env: environment exposing ``reset()``, ``step(action)``,
            ``done`` and ``reward`` -- NOTE(review): confirm this contract
            against the actual environment class.
        iterations: number of self-play games to generate and learn from.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # Bounded replay buffer. The original used an unbounded list, which
    # grows without limit over long runs; deque evicts the oldest samples.
    memory = deque(maxlen=10000)
    for i in range(iterations):
        # 1. Self-play (data generation).
        state = game_env.reset()
        game_history = []
        while not game_env.done:
            # Use MCTS guided by 'model' to pick a move.
            action, probs = mcts_search(state, model)
            game_history.append((state, probs))
            state = game_env.step(action)
        # 2. Label every visited state with the final game outcome (Z).
        reward = game_env.reward
        for s, p in game_history:
            memory.append((s, p, reward))
        # 3. Incremental learning once enough samples have accumulated.
        if len(memory) > 500:
            # random.sample needs a sequence; deque indexing is O(n), so
            # snapshot to a list first. (Original also never imported random.)
            batch = random.sample(list(memory), 64)
            loss = update_model(model, optimizer, batch)
            print(f"Iteration {i}: Loss {loss:.4f} - System Optimized.")
import torch
import torch.nn as nn
class SEBlock(nn.Module):
    """Squeeze-and-Excitation for channel-wise attention."""

    def __init__(self, channels, reduction=16):
        super().__init__()
        squeezed = channels // reduction
        # Bottleneck MLP that produces one sigmoid gate per channel.
        self.fc = nn.Sequential(
            nn.Linear(channels, squeezed, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(squeezed, channels, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        batch, chans = x.size(0), x.size(1)
        # Squeeze: global average pool over the spatial dimensions.
        pooled = x.view(batch, chans, -1).mean(dim=2)
        # Excite: per-channel gates in (0, 1), broadcast over H x W.
        gates = self.fc(pooled).view(batch, chans, 1, 1)
        return x * gates.expand_as(x)
class ResBlock(nn.Module):
    """Residual block: two 3x3 conv+BN layers with SE channel attention."""

    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(channels)
        self.se = SEBlock(channels)

    def forward(self, x):
        skip = x
        h = self.conv1(x)
        h = torch.relu(self.bn1(h))
        h = self.bn2(self.conv2(h))
        # Recalibrate channels before adding the skip connection.
        h = self.se(h)
        return torch.relu(h + skip)
import torch.multiprocessing as mp
class ActiveAlphaZero:
    """Continuously-running AlphaZero: actor processes generate self-play
    trajectories while a learner process optimizes the shared model.

    NOTE(review): ``GameEnv``, ``run_self_play``, ``sample_from_buffer``,
    ``compute_alphazero_loss`` and ``update_dev_hud`` are defined elsewhere
    in the project -- confirm their contracts before use.
    """

    def __init__(self, model):
        # share_memory() places parameters in shared memory so actor
        # processes observe learner updates without copying the model.
        self.model = model.share_memory()
        self.buffer = mp.Queue(maxsize=10000)

    def actor_process(self, process_id):
        """Continuous self-play: endlessly push trajectories to the buffer."""
        game = GameEnv()
        while True:
            # Generate a trajectory using MCTS guided by the current model.
            trajectory = self.run_self_play(game)
            self.buffer.put(trajectory)

    def learner_process(self):
        """Continuous optimization over sampled self-play batches."""
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-3, weight_decay=1e-4)
        while True:
            batch = self.sample_from_buffer()
            # Multi-head loss: policy (cross-entropy) + value (MSE).
            loss = self.compute_alphazero_loss(batch)
            # Bug fix: the original never called zero_grad(), so gradients
            # accumulated across every step instead of being reset.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # HUD update: emit diagnostic stats.
            self.update_dev_hud(loss.item())