# Training settings.
episodes = 500  # number of games (episodes) to play
alpha, gamma = 0.6, 0.75  # learning rate, reward discount

# History buffers; two separate lists, both starting empty.
j_history = []  # history of step counts
r_history = []  # history of reward values
for i in range(episodes):