# NOTE(review): the original source had all inter-token whitespace stripped and
# the lines of two separate fragments (training-side backend setup, and a
# generation demo) interleaved out of order. The statements below are restored
# to their conventional grouping — verify the exact ordering against the
# upstream RWKV train/demo scripts before relying on it.

# --- Backend / precision setup -------------------------------------------
os.environ["RWKV_JIT_ON"] = "1"
if "deepspeed_stage_3" in args.strategy:
    # presumably JIT is disabled because it conflicts with ZeRO stage-3
    # parameter sharding — TODO confirm against upstream
    os.environ["RWKV_JIT_ON"] = "0"

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True
if args.precision == "fp32":
    # strict fp32: forbid TF32 tensor-core matmuls (bit-accurate but slow)
    torch.backends.cudnn.allow_tf32 = False
    torch.backends.cuda.matmul.allow_tf32 = False
else:
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cuda.matmul.allow_tf32 = True

# NOTE(review): the fp32 warning appeared unguarded in the scrambled source;
# guarding it on args.precision mirrors the fp16 branch — confirm upstream.
if args.precision == "fp32":
    rank_zero_info("\n\nNote: you are using fp32 (very slow). Try bf16 / tf32 for faster training.\n\n")
if args.precision == "fp16":
    rank_zero_info("\n\nNote: you are using fp16 (might overflow). Try bf16 / tf32 for stable training.\n\n")

# --- Generation demo: feed tokens incrementally through the RNN ----------
out, state = model.forward([187, 510], None)
out, state = model.forward([1563], state)  # RNN has state (use deepcopy to clone states)
out, state = model.forward([310, 247], state)
print(out.detach().cpu().numpy())  # same result as above
print('\n')

from src.utils import PIPELINE, PIPELINE_ARGS
pipeline = PIPELINE(model, "20B_tokenizer.json")

ctx = "\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese."
print(ctx, end='')


def my_print(s):
    # Streaming callback: emit each generated fragment immediately, unbuffered.
    print(s, end='', flush=True)

# For alpha_frequency and alpha_presence, see "Frequency and presence penalties":