# Hyperparameters and history lists.
# NOTE(review): these statements were previously fused onto one physical line,
# so everything after the first `#` was comment text and only `alpha` was
# actually assigned. They are restored here as separate statements.
alpha = 0.6  # learning rate
gamma = 0.75  # reward discount
episodes = 500  # number of game rounds to play
r_history = []  # history of reward values
j_history = []  # history of step counts

# NOTE(review): the original line ended with the loop header
#     for i in range(episodes):
# whose body is not visible in this chunk. It is kept as a comment (which is
# what it effectively was before); restore it together with its indented
# body -- TODO confirm against the rest of the file.