-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcode.py
More file actions
129 lines (108 loc) · 3.8 KB
/
code.py
File metadata and controls
129 lines (108 loc) · 3.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# -*- coding: utf-8 -*-
"""Code.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1JUS6wxTOSpplc0VCqvYGHonksJuBCafF
"""
import numpy as np
import matplotlib.pyplot as plt
class MathEnv:
    """Single-step addition environment: the agent must answer a + b.

    Each episode poses one problem; the episode ends after the first guess.
    """

    def __init__(self):
        self.state = None      # current problem as a tuple (a, b)
        self.solution = None   # correct answer a + b
        self.done = False      # True once the episode has ended
        self.reset()

    def reset(self):
        """Draw a fresh problem with both operands in [0, 10) and return it."""
        a = np.random.randint(0, 10)
        b = np.random.randint(0, 10)
        self.state = (a, b)
        self.solution = a + b
        self.done = False
        return self.state

    def step(self, action):
        """Score the proposed answer; every episode terminates after one step."""
        reward = 1 if action == self.solution else -1
        self.done = True
        return self.state, reward, self.done
class QLearningAgent:
    """Tabular Q-learning agent over integer-encoded states.

    Args:
        alpha: learning rate in (0, 1].
        gamma: discount factor.
        epsilon: exploration probability for epsilon-greedy action selection.
        num_actions: size of the discrete action space.
        state_size: number of distinct (encoded) states.
    """

    def __init__(self, alpha, gamma, epsilon, num_actions, state_size):
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.num_actions = num_actions
        # One row per encoded state, one column per action.
        self.q_table = np.zeros((state_size, num_actions))

    def choose_action(self, state):
        """Epsilon-greedy: explore uniformly with prob. epsilon, else exploit."""
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, self.num_actions)
        return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state, done=False):
        """Apply the standard Q-learning update for one transition.

        Backward-compatible generalization: pass ``done=True`` for a terminal
        transition so the target does not bootstrap from ``next_state`` — the
        correct TD target for a terminal step is the reward alone. The default
        (``done=False``) preserves the original behavior for existing callers.
        """
        old_value = self.q_table[state, action]
        next_max = 0.0 if done else np.max(self.q_table[next_state])
        target = reward + self.gamma * next_max
        self.q_table[state, action] = (1 - self.alpha) * old_value + self.alpha * target
def train_agent(agent, env, num_episodes):
    """Train `agent` on `env` for `num_episodes` episodes.

    Returns:
        List of the total reward collected in each episode.
    """

    def encode(pair):
        # Flatten the (a, b) state tuple into a single table index in [0, 99].
        return pair[0] * 10 + pair[1]

    rewards = []
    for _ in range(num_episodes):
        state = env.reset()
        total = 0
        done = False
        while not done:
            action = agent.choose_action(encode(state))
            next_state, reward, done = env.step(action)
            agent.update_q_table(encode(state), action, reward, encode(next_state))
            state = next_state
            total += reward
        rewards.append(total)
    return rewards
def test_agent_with_output(agent, env, num_tests):
total_reward = 0
for _ in range(num_tests):
state = env.reset()
print("Problem:", state[0], "+", state[1], "= ?")
while True:
action = agent.choose_action(state[0]*10 + state[1])
next_state, reward, done = env.step(action)
print("Agent's action:", action)
total_reward += reward
state = next_state
if done:
if reward == 1:
print("Agent's answer is correct!")
else:
print("Agent's answer is incorrect.")
break
return total_reward / num_tests
def plot_rewards(rewards):
    """Display a line chart of the reward obtained in each episode."""
    plt.figure(figsize=(12, 6))
    ax = plt.gca()
    ax.plot(rewards, label='Average Reward')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Average Reward')
    ax.set_title('Average Reward per Episode')
    ax.legend()
    plt.show()
def plot_q_values(agent):
    """Plot each action's Q-value as a function of the encoded state.

    Draws one line per action column of the agent's Q-table.
    """
    q_values = agent.q_table
    plt.figure(figsize=(12, 8))
    # Iterate over action columns directly (removed an unused `num_actions`
    # local that was assigned but never read).
    for i in range(q_values.shape[1]):
        plt.plot(q_values[:, i], label=f'Action {i}')
    plt.xlabel('State')
    plt.ylabel('Q-value')
    plt.title('Q-values for each action in different states')
    plt.legend()
    plt.show()
def main():
    """Entry point: train the agent, report test results, and show the plots."""
    env = MathEnv()
    agent = QLearningAgent(alpha=0.1, gamma=0.9, epsilon=0.1,
                           num_actions=20, state_size=100)

    episode_rewards = train_agent(agent, env, 5000)

    average_reward = test_agent_with_output(agent, env, 5)
    print("Average reward:", average_reward)

    plot_rewards(episode_rewards)
    plot_q_values(agent)


if __name__ == "__main__":
    main()