diff --git a/crypto/aes/asm/vpaes-ppc.pl b/crypto/aes/asm/vpaes-ppc.pl index 7fda60ed9e4d59d35fc9222a69ac313a397b942d..1759ae9dcff272227a9f68e3c1a545361efe9c93 100644 --- a/crypto/aes/asm/vpaes-ppc.pl +++ b/crypto/aes/asm/vpaes-ppc.pl @@ -337,24 +337,27 @@ Lenc_entry: addi $inp, $inp, 15 # 15 is not a typo ?lvsr $outperm, 0, $out ?lvsl $keyperm, 0, $key # prepare for unaligned access - vnor $outmask, v7, v7 # 0xff..ff lvx $inptail, 0, $inp # redundant in aligned case - ?vperm $outmask, v7, $outmask, $outperm - lvx $outhead, 0, $out ?vperm v0, v0, $inptail, $inpperm bl _vpaes_encrypt_core + andi. r8, $out, 15 + li r9, 16 + beq Lenc_out_aligned + vperm v0, v0, v0, $outperm # rotate right/left - vsel v1, $outhead, v0, $outmask - vmr $outhead, v0 - stvx v1, 0, $out - addi $out, $out, 15 # 15 is not a typo - ######## + mtctr r9 +Lenc_out_unaligned: + stvebx v0, 0, $out + addi $out, $out, 1 + bdnz Lenc_out_unaligned + b Lenc_done - lvx v1, 0, $out # redundant in aligned case - vsel v1, $outhead, v1, $outmask - stvx v1, 0, $out +.align 4 +Lenc_out_aligned: + stvx v0, 0, $out +Lenc_done: li r10,`15+6*$SIZE_T` li r11,`31+6*$SIZE_T` @@ -566,24 +569,27 @@ Ldec_entry: addi $inp, $inp, 15 # 15 is not a typo ?lvsr $outperm, 0, $out ?lvsl $keyperm, 0, $key - vnor $outmask, v7, v7 # 0xff..ff lvx $inptail, 0, $inp # redundant in aligned case - ?vperm $outmask, v7, $outmask, $outperm - lvx $outhead, 0, $out ?vperm v0, v0, $inptail, $inpperm bl _vpaes_decrypt_core + andi. r8, $out, 15 + li r9, 16 + beq Ldec_out_aligned + vperm v0, v0, v0, $outperm # rotate right/left - vsel v1, $outhead, v0, $outmask - vmr $outhead, v0 - stvx v1, 0, $out - addi $out, $out, 15 # 15 is not a typo - ######## + mtctr r9 +Ldec_out_unaligned: + stvebx v0, 0, $out + addi $out, $out, 1 + bdnz Ldec_out_unaligned + b Ldec_done - lvx v1, 0, $out # redundant in aligned case - vsel v1, $outhead, v1, $outmask - stvx v1, 0, $out +.align 4 +Ldec_out_aligned: + stvx v0, 0, $out +Ldec_done: li r10,`15+6*$SIZE_T` li r11,`31+6*$SIZE_T` @@ -658,11 +664,11 @@ Ldec_entry: $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp) and r30, r5, r9 # copy length&-16 + andi. r9, $out, 15 # is $out aligned? mr r5, r6 # copy pointer to key mr r31, r7 # copy pointer to iv - blt Lcbc_abort - cmpwi r8, 0 # test direction li r6, -1 + mcrf cr1, cr0 # put aside $out alignment flag mr r7, r12 # copy vrsave mtspr 256, r6 # preserve all AltiVec registers @@ -672,6 +678,7 @@ Ldec_entry: lvx v25, r9, r31 ?vperm v24, v24, v25, $inpperm + cmpwi r8, 0 # test direction neg r8, $inp # prepare for unaligned access vxor v7, v7, v7 ?lvsl $keyperm, 0, $key @@ -681,13 +688,37 @@ Ldec_entry: lvx $inptail, 0, $inp ?vperm $outmask, v7, $outmask, $outperm addi $inp, $inp, 15 # 15 is not a typo - lvx $outhead, 0, $out beq Lcbc_decrypt bl _vpaes_encrypt_preheat li r0, 16 + beq cr1, Lcbc_enc_loop # $out is aligned + + vmr v0, $inptail + lvx $inptail, 0, $inp + addi $inp, $inp, 16 + ?vperm v0, v0, $inptail, $inpperm + vxor v0, v0, v24 # ^= iv + + bl _vpaes_encrypt_core + + andi. r8, $out, 15 + vmr v24, v0 # put aside iv + sub r9, $out, r8 + vperm $outhead, v0, v0, $outperm # rotate right/left + +Lcbc_enc_head: + stvebx $outhead, r8, r9 + cmpwi r8, 15 + addi r8, r8, 1 + bne Lcbc_enc_head + + sub. r30, r30, r0 # len -= 16 + addi $out, $out, 16 + beq Lcbc_unaligned_done + Lcbc_enc_loop: vmr v0, $inptail lvx $inptail, 0, $inp @@ -713,6 +744,32 @@ Lcbc_decrypt: bl _vpaes_decrypt_preheat li r0, 16 + beq cr1, Lcbc_dec_loop # $out is aligned + + vmr v0, $inptail + lvx $inptail, 0, $inp + addi $inp, $inp, 16 + ?vperm v0, v0, $inptail, $inpperm + vmr v25, v0 # put aside input + + bl _vpaes_decrypt_core + + andi. r8, $out, 15 + vxor v0, v0, v24 # ^= iv + vmr v24, v25 + sub r9, $out, r8 + vperm $outhead, v0, v0, $outperm # rotate right/left + +Lcbc_dec_head: + stvebx $outhead, r8, r9 + cmpwi r8, 15 + addi r8, r8, 1 + bne Lcbc_dec_head + + sub. r30, r30, r0 # len -= 16 + addi $out, $out, 16 + beq Lcbc_unaligned_done + Lcbc_dec_loop: vmr v0, $inptail lvx $inptail, 0, $inp @@ -733,23 +790,29 @@ Lcbc_dec_loop: bne Lcbc_dec_loop Lcbc_done: - addi $out, $out, -1 - lvx v1, 0, $out # redundant in aligned case - vsel v1, $outhead, v1, $outmask - stvx v1, 0, $out - + beq cr1, Lcbc_write_iv # $out is aligned + +Lcbc_unaligned_done: + andi. r8, $out, 15 + sub $out, $out, r8 + li r9, 0 +Lcbc_tail: + stvebx $outhead, r9, $out + addi r9, r9, 1 + cmpw r9, r8 + bne Lcbc_tail + +Lcbc_write_iv: neg r8, r31 # write [potentially unaligned] iv + li r10, 4 ?lvsl $outperm, 0, r8 - li r6, 15 - vnor $outmask, v7, v7 # 0xff..ff - ?vperm $outmask, v7, $outmask, $outperm - lvx $outhead, 0, r31 + li r11, 8 + li r12, 12 vperm v24, v24, v24, $outperm # rotate right/left - vsel v0, $outhead, v24, $outmask - lvx v1, r6, r31 - stvx v0, 0, r31 - vsel v1, v24, v1, $outmask - stvx v1, r6, r31 + stvewx v24, 0, r31 # ivp is at least 32-bit aligned + stvewx v24, r10, r31 + stvewx v24, r11, r31 + stvewx v24, r12, r31 mtspr 256, r7 # restore vrsave li r10,`15+6*$SIZE_T` @@ -872,18 +935,21 @@ _vpaes_schedule_core: # encrypting, output zeroth round key after transform li r8, 0x30 # mov \$0x30,%r8d - addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 + li r9, 4 + li r10, 8 + li r11, 12 ?lvsr $outperm, 0, $out # prepare for unaligned access vnor $outmask, v9, v9 # 0xff..ff - lvx $outhead, 0, $out ?vperm $outmask, v9, $outmask, $outperm #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx) - vperm v1, v0, v0, $outperm # rotate right/left - vsel v2, $outhead, v1, $outmask - vmr $outhead, v1 - stvx v2, 0, $out + vperm $outhead, v0, v0, $outperm # rotate right/left + stvewx $outhead, 0, $out # some are superfluous + stvewx $outhead, r9, $out + stvewx $outhead, r10, $out + addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 + stvewx $outhead, r11, $out b Lschedule_go Lschedule_am_decrypting: @@ -893,20 +959,24 @@ Lschedule_am_decrypting: addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 # decrypting, output zeroth round key after shiftrows lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 + li r9, 4 + li r10, 8 + li r11, 12 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 neg r0, $out # prepare for unaligned access ?lvsl $outperm, 0, r0 - addi $out, $out, 15 # 15 is not typo vnor $outmask, v9, v9 # 0xff..ff - lvx $outhead, 0, $out ?vperm $outmask, $outmask, v9, $outperm #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx) - vperm v4, v4, v4, $outperm # rotate right/left - vsel v2, $outhead, v4, $outmask - vmr $outhead, v4 - stvx v2, 0, $out + vperm $outhead, v4, v4, $outperm # rotate right/left + stvewx $outhead, 0, $out # some are superfluous + stvewx $outhead, r9, $out + stvewx $outhead, r10, $out + addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 + stvewx $outhead, r11, $out + addi $out, $out, 15 # 15 is not typo xori r8, r8, 0x30 # xor \$0x30, %r8 Lschedule_go: @@ -1038,14 +1108,15 @@ Lschedule_mangle_last: #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key vperm v0, v0, v0, $outperm # rotate right/left + li r10, 4 vsel v2, $outhead, v0, $outmask - vmr $outhead, v0 + li r11, 8 stvx v2, 0, $out - - addi $out, $out, 15 # 15 is not typo - lvx v1, 0, $out # redundant in aligned case - vsel v1, $outhead, v1, $outmask - stvx v1, 0, $out + li r12, 12 + stvewx v0, 0, $out # some (or all) are redundant + stvewx v0, r10, $out + stvewx v0, r11, $out + stvewx v0, r12, $out b Lschedule_mangle_done .align 4 @@ -1057,15 +1128,18 @@ Lschedule_mangle_last_dec: bl _vpaes_schedule_transform # output transform #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key + addi r9, $out, -15 # -15 is not typo vperm v0, v0, v0, $outperm # rotate right/left + li r10, 4 vsel v2, $outhead, v0, $outmask - vmr $outhead, v0 + li r11, 8 stvx v2, 0, $out + li r12, 12 + stvewx v0, 0, r9 # some (or all) are redundant + stvewx v0, r10, r9 + stvewx v0, r11, r9 + stvewx v0, r12, r9 - addi $out, $out, -15 # -15 is not typo - lvx v1, 0, $out # redundant in aligned case - vsel v1, $outhead, v1, $outmask - stvx v1, 0, $out Lschedule_mangle_done: mtlr r7