提交 003f5698 编写于 作者: C Christoph Müllner 提交者: Hugo Landau

riscv: GCM: Provide a Zvbb/Zvbc-based implementation

The RISC-V vector crypto extensions feature a Zvbc extension
that provides a carryless multiplication ('vclmul.vv') instruction.
This patch provides an implementation that utilizes this
extension if available.

Tested on QEMU and no regressions observed.
Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Hugo Landau <hlandau@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/21923)
上级 cdea6719
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V vector ('V') with VLEN >= 128
# - Vector Bit-manipulation used in Cryptography ('Zvbb')
# - Vector Carryless Multiplication ('Zvbc')
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Redirect STDOUT to the requested output file.  The three-argument form of
# open keeps characters in the file name from being interpreted as a mode,
# and a failure is reported instead of silently writing to the old STDOUT.
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
# Accumulator for the generated assembly; it is printed at the end of the
# script.
my $code=<<___;
.text
___
################################################################################
# void gcm_init_rv64i_zvbb_zvbc(u128 Htable[16], const u64 H[2]);
#
# input: H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zvbb_zvbc and
# gcm_ghash_rv64i_zvbb_zvbc
#
# The emitted routine loads the two 64-bit words of H in swapped order,
# shifts the resulting 128-bit value left by one bit, XORs in the Lpolymod
# constant when a bit was shifted out of the upper word, and stores the two
# resulting 64-bit words at the start of Htable.
{
# Scalar registers: a0/a1 carry the two arguments; t0, t1 and t2 hold the
# load stride (-8), the shift count (63) and the address of Lpolymod.
my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2");
# v0 is the mask register consumed by the *_v0t helpers and by vmerge.vim.
# $V5 and $V6 are declared but not referenced by this routine.
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");
# Everything up to the closing ___ is appended verbatim to the generated
# assembly; each @{[...]} is replaced by the ".word" directive returned by
# the corresponding encoder helper.
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvbb_zvbc
.type gcm_init_rv64i_zvbb_zvbc,\@function
gcm_init_rv64i_zvbb_zvbc:
# Load/store data in reverse order.
# This is needed as a part of endianness swap.
add $H, $H, 8
li $TMP0, -8
li $TMP1, 63
la $TMP2, Lpolymod
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vlse64_v $V1, $H, $TMP0]} # vlse64.v v1, (a1), t0
@{[vle64_v $V2, $TMP2]} # vle64.v v2, (t2)
# Shift one left and get the carry bits.
@{[vsrl_vx $V3, $V1, $TMP1]} # vsrl.vx v3, v1, t1
@{[vsll_vi $V1, $V1, 1]} # vsll.vi v1, v1, 1
# Use the fact that the polynomial degree is no more than 128,
# i.e. only the LSB of the upper half could be set.
# Thanks to this we don't need to do the full reduction here.
# Instead simply subtract the reduction polynomial.
# This idea was taken from x86 ghash implementation in OpenSSL.
@{[vslideup_vi $V4, $V3, 1]} # vslideup.vi v4, v3, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
@{[vor_vv_v0t $V1, $V1, $V4]} # vor.vv v1, v1, v4, v0.t
# Need to set the mask to 3, if the carry bit is set.
@{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3
@{[vmv_v_i $V3, 0]} # vmv.v.i v3, 0
@{[vmerge_vim $V3, $V3, 3]} # vmerge.vim v3, v3, 3, v0
@{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3
@{[vxor_vv_v0t $V1, $V1, $V2]} # vxor.vv v1, v1, v2, v0.t
@{[vse64_v $V1, $Htable]} # vse64.v v1, (a0)
ret
.size gcm_init_rv64i_zvbb_zvbc,.-gcm_init_rv64i_zvbb_zvbc
___
}
################################################################################
# void gcm_gmult_rv64i_zvbb_zvbc(u64 Xi[2], const u128 Htable[16]);
#
# input: Xi: current hash value
# Htable: preprocessed H
# output: Xi: next hash value Xi = (Xi * H mod f)
#
# The emitted routine reads only the first two 64-bit words of Htable
# (offsets 0 and 8) and the second dword of Lpolymod, which is the constant P
# used by the reduction.  Xi is loaded and stored with a stride of -8 and
# byte-reversed with vrev8.v on the way in and on the way out.
{
# Scalar registers: a0/a1 carry the arguments, t0/t1 the two words of the
# preprocessed H, t3 the reduction constant P and t4 the -8 stride.
my ($Xi,$Htable,$TMP0,$TMP1,$TMP3,$TMP4) = ("a0","a1","t0","t1","t3","t4");
# v0 is the mask register consumed by the *_v0t helpers.
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");
$code .= <<___;
.text
.p2align 3
.globl gcm_gmult_rv64i_zvbb_zvbc
.type gcm_gmult_rv64i_zvbb_zvbc,\@function
gcm_gmult_rv64i_zvbb_zvbc:
ld $TMP0, ($Htable)
ld $TMP1, 8($Htable)
la $TMP3, Lpolymod
ld $TMP3, 8($TMP3)
# Load/store data in reverse order.
# This is needed as a part of endianness swap.
add $Xi, $Xi, 8
li $TMP4, -8
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vlse64_v $V5, $Xi, $TMP4]} # vlse64.v v5, (a0), t4
@{[vrev8_v $V5, $V5]} # vrev8.v v5, v5
# Multiplication
# Do two 64x64 multiplications in one go to save some time
# and simplify things.
# A = a1a0 (t1, t0)
# B = b1b0 (v5)
# C = c1c0 (256 bit)
# c1 = a1b1 + (a0b1)h + (a1b0)h
# c0 = a0b0 + (a0b1)l + (a1b0)l
# v1 = (a0b1)l,(a0b0)l
@{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
# v3 = (a0b1)h,(a0b0)h
@{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0
# v4 = (a1b1)l,(a1b0)l
@{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
# v2 = (a1b1)h,(a1b0)h
@{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1
# Is there a better way to do this?
# Would need to swap the order of elements within a vector register.
@{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
@{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
# v2 += (a0b1)h
@{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
# v2 += (a1b1)l
@{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t
@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
# v1 += (a0b0)h,0
@{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
# v1 += (a1b0)l,0
@{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t
# Now the 256bit product should be stored in (v2,v1)
# v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
# v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l
# Reduction
# Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
# This is a slight variation of the Gueron's Montgomery reduction.
# The difference being the order of some operations has been changed,
# to make a better use of vclmul(h) instructions.
# First step:
# c1 += (c0 * P)l
# vmv.v.i v0, 2
@{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
@{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# Second step:
# D = d1,d0 is final result
# We want:
# m1 = c1 + (c1 * P)h
# m0 = (c1 * P)l + (c0 * P)h + c0
# d1 = c3 + m1
# d0 = c2 + m0
#v3 = (c1 * P)l, 0
@{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
#v4 = (c1 * P)h, (c0 * P)h
@{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# XOR in the upper part of the product
@{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1
@{[vrev8_v $V2, $V2]} # vrev8.v v2, v2
@{[vsse64_v $V2, $Xi, $TMP4]} # vsse64.v v2, (a0), t4
ret
.size gcm_gmult_rv64i_zvbb_zvbc,.-gcm_gmult_rv64i_zvbb_zvbc
___
}
################################################################################
# void gcm_ghash_rv64i_zvbb_zvbc(u64 Xi[2], const u128 Htable[16],
# const u8 *inp, size_t len);
#
# input: Xi: current hash value
# Htable: preprocessed H
# inp: pointer to input data
# len: length of input data in bytes (multiple of block size)
# output: Xi: Xi+1 (next hash value Xi)
#
# The emitted loop consumes one 16-byte block per iteration and tests len
# only after the block has been processed, so len must be a non-zero
# multiple of 16.  The running hash stays in v5 between iterations and is
# written back to Xi once, after the last block.
{
# Scalar registers: a0-a3 carry the arguments, t0/t1 the two words of the
# preprocessed H, t3 the reduction constant P (second dword of Lpolymod) and
# t4 the -8 stride used for the reversed loads and the final store.
my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP3,$M8) = ("a0","a1","a2","a3","t0","t1","t3","t4");
# v0 is the mask register consumed by the *_v0t helpers; v7 receives the
# current input block.
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) = ("v0","v1","v2","v3","v4","v5","v6","v7");
$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zvbb_zvbc
.type gcm_ghash_rv64i_zvbb_zvbc,\@function
gcm_ghash_rv64i_zvbb_zvbc:
ld $TMP0, ($Htable)
ld $TMP1, 8($Htable)
la $TMP3, Lpolymod
ld $TMP3, 8($TMP3)
# Load/store data in reverse order.
# This is needed as a part of endianness swap.
add $Xi, $Xi, 8
add $inp, $inp, 8
li $M8, -8
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vlse64_v $V5, $Xi, $M8]} # vlse64.v v5, (a0), t4
Lstep:
# Read input data
@{[vlse64_v $Vinp, $inp, $M8]} # vlse64.v v7, (a2), t4
add $inp, $inp, 16
add $len, $len, -16
# XOR them into Xi
@{[vxor_vv $V5, $V5, $Vinp]} # vxor.vv v5, v5, v7
@{[vrev8_v $V5, $V5]} # vrev8.v v5, v5
# Multiplication
# Do two 64x64 multiplications in one go to save some time
# and simplify things.
# A = a1a0 (t1, t0)
# B = b1b0 (v5)
# C = c1c0 (256 bit)
# c1 = a1b1 + (a0b1)h + (a1b0)h
# c0 = a0b0 + (a0b1)l + (a1b0)l
# v1 = (a0b1)l,(a0b0)l
@{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
# v3 = (a0b1)h,(a0b0)h
@{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0
# v4 = (a1b1)l,(a1b0)l
@{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
# v2 = (a1b1)h,(a1b0)h
@{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1
# Is there a better way to do this?
# Would need to swap the order of elements within a vector register.
@{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
@{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
# v2 += (a0b1)h
@{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
# v2 += (a1b1)l
@{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t
@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
# v1 += (a0b0)h,0
@{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
# v1 += (a1b0)l,0
@{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t
# Now the 256bit product should be stored in (v2,v1)
# v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
# v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l
# Reduction
# Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
# This is a slight variation of the Gueron's Montgomery reduction.
# The difference being the order of some operations has been changed,
# to make a better use of vclmul(h) instructions.
# First step:
# c1 += (c0 * P)l
# vmv.v.i v0, 2
@{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
@{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# Second step:
# D = d1,d0 is final result
# We want:
# m1 = c1 + (c1 * P)h
# m0 = (c1 * P)l + (c0 * P)h + c0
# d1 = c3 + m1
# d0 = c2 + m0
#v3 = (c1 * P)l, 0
@{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
#v4 = (c1 * P)h, (c0 * P)h
@{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# XOR in the upper part of the product
@{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1
@{[vrev8_v $V5, $V2]} # vrev8.v v5, v2
bnez $len, Lstep
@{[vsse64_v $V5, $Xi, $M8]} # vsse64.v v5, (a0), t4
ret
.size gcm_ghash_rv64i_zvbb_zvbc,.-gcm_ghash_rv64i_zvbb_zvbc
___
}
# Constant table referenced by the routines above through "la ..., Lpolymod":
# two 64-bit words, 0x1 followed by 0xc200000000000000.
my $polymod_table = <<___;
.p2align 4
Lpolymod:
.dword 0x0000000000000001
.dword 0xc200000000000000
.size Lpolymod,.-Lpolymod
___
# Emit the accumulated assembly followed by the table, then make sure the
# output was flushed successfully.
print($code . $polymod_table);
close(STDOUT) or die "error closing STDOUT: $!";
......@@ -43,7 +43,7 @@ IF[{- !$disabled{asm} -}]
$MODESASM_c64xplus=ghash-c64xplus.s
$MODESDEF_c64xplus=GHASH_ASM
$MODESASM_riscv64=ghash-riscv64.s
$MODESASM_riscv64=ghash-riscv64.s ghash-riscv64-zvbb-zvbc.s
$MODESDEF_riscv64=GHASH_ASM
# Now that we have defined all the arch specific variables, use the
......@@ -91,3 +91,4 @@ GENERATE[ghash-s390x.S]=asm/ghash-s390x.pl
INCLUDE[ghash-s390x.o]=..
GENERATE[ghash-c64xplus.S]=asm/ghash-c64xplus.pl
GENERATE[ghash-riscv64.s]=asm/ghash-riscv64.pl
GENERATE[ghash-riscv64-zvbb-zvbc.s]=asm/ghash-riscv64-zvbb-zvbc.pl
......@@ -413,6 +413,11 @@ void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
const u8 *inp, size_t len);
void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
const u8 *inp, size_t len);
/* Zvbb/Zvbc (vector crypto with vclmul) based routines. */
void gcm_init_rv64i_zvbb_zvbc(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_rv64i_zvbb_zvbc(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_rv64i_zvbb_zvbc(u64 Xi[2], const u128 Htable[16],
const u8 *inp, size_t len);
# endif
#endif
......@@ -512,7 +517,11 @@ static void gcm_get_funcs(struct gcm_funcs_st *ctx)
ctx->gmult = gcm_gmult_4bit;
ctx->ghash = gcm_ghash_4bit;
if (RISCV_HAS_ZBC()) {
if (RISCV_HAS_ZVBB() && RISCV_HAS_ZVBC() && riscv_vlen() >= 128) {
ctx->ginit = gcm_init_rv64i_zvbb_zvbc;
ctx->gmult = gcm_gmult_rv64i_zvbb_zvbc;
ctx->ghash = gcm_ghash_rv64i_zvbb_zvbc;
} else if (RISCV_HAS_ZBC()) {
if (RISCV_HAS_ZBKB()) {
ctx->ginit = gcm_init_rv64i_zbc__zbkb;
ctx->gmult = gcm_gmult_rv64i_zbc__zbkb;
......
......@@ -77,6 +77,29 @@ sub read_reg {
return $1;
}
# Names of the 32 vector registers (v0 .. v31); the lookup hash maps each
# name to itself and is only used for membership tests here.
my @vregs = map { "v$_" } 0 .. 31;
my %vreglookup;
@vreglookup{@vregs} = @vregs;

# Translate a vector register name (case-insensitive, e.g. "v5" or "V5")
# into its register number.  Dies with "Unknown vector register ..." when the
# name is not one of v0..v31; a stack trace is appended to the message when
# $have_stacktrace is set.
sub read_vreg {
    my $name = lc shift;
    unless (exists $vreglookup{$name}) {
        my $trace = $have_stacktrace ? Devel::StackTrace->new->as_string : "";
        die("Unknown vector register ".$name."\n".$trace);
    }
    # The digits after the leading "v" are the register number.
    return $1 if $name =~ /^v([0-9]+)$/;
    my $trace = $have_stacktrace ? Devel::StackTrace->new->as_string : "";
    die("Could not process vector register ".$name."\n".$trace);
}
# Helper functions
sub brev8_rv64i {
......@@ -256,4 +279,183 @@ sub rev8 {
return ".word ".($template | ($rs << 15) | ($rd << 7));
}
# Vector instructions
# Build the ".word" directive for "vle64.v vd, (rs1)".
# Arguments: destination vector register name, base address register name.
# Returns the string ".word " followed by the instruction word in decimal.
sub vle64_v {
    my ($dest, $base) = @_;
    my $insn = 0b0000001_00000_00000_111_00000_0000111;
    my $vd = read_vreg($dest);
    my $rs1 = read_reg($base);
    $insn |= ($vd << 7) | ($rs1 << 15);
    return ".word $insn";
}
# Build the ".word" directive for "vlse64.v vd, (rs1), rs2".
# Arguments: destination vector register name, base address register name,
# stride register name.
# Returns the string ".word " followed by the instruction word in decimal.
sub vlse64_v {
    my ($dest, $base, $stride) = @_;
    my $insn = 0b0000101_00000_00000_111_00000_0000111;
    my $vd = read_vreg($dest);
    my $rs1 = read_reg($base);
    my $rs2 = read_reg($stride);
    $insn |= ($vd << 7) | ($rs1 << 15) | ($rs2 << 20);
    return ".word $insn";
}
# Build the ".word" directive for "vmerge.vim vd, vs2, imm, v0".
# Arguments: destination vector register name, source vector register name,
# immediate.
# The immediate occupies the 5-bit field at bits 19:15.  Negative values
# (-16..-1) are reduced to their 5-bit two's-complement pattern; 0..31 are
# encoded as given.  Anything else would overwrite the neighbouring vs2
# field, so it is rejected.
# Returns the string ".word " followed by the instruction word in decimal.
sub vmerge_vim {
    my ($dest, $src, $imm) = @_;
    my $template = 0b0101110_00000_00000_011_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src);
    die("vmerge.vim immediate out of range: ".$imm."\n")
        if ($imm < -16 || $imm > 31);
    my $imm5 = $imm & 0x1f;
    return ".word ".($template | ($vs2 << 20) | ($imm5 << 15) | ($vd << 7));
}
# Build the ".word" directive for "vmv.v.i vd, imm".
# Arguments: destination vector register name, immediate.
# The immediate occupies the 5-bit field at bits 19:15.  Negative values
# (-16..-1) are reduced to their 5-bit two's-complement pattern; 0..31 are
# encoded as given.  Anything else would overwrite bits above the field, so
# it is rejected.
# Returns the string ".word " followed by the instruction word in decimal.
sub vmv_v_i {
    my ($dest, $imm) = @_;
    my $template = 0b0101111_00000_00000_011_00000_1010111;
    my $vd = read_vreg($dest);
    die("vmv.v.i immediate out of range: ".$imm."\n")
        if ($imm < -16 || $imm > 31);
    my $imm5 = $imm & 0x1f;
    return ".word ".($template | ($imm5 << 15) | ($vd << 7));
}
# Build the ".word" directive for "vmv.v.v vd, vs1".
# Arguments: destination vector register name, source vector register name.
# Returns the string ".word " followed by the instruction word in decimal.
sub vmv_v_v {
    my ($dest, $src) = @_;
    my $insn = 0b0101111_00000_00000_000_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs1 = read_vreg($src);
    $insn |= ($vd << 7) | ($vs1 << 15);
    return ".word $insn";
}
# Build the ".word" directive for the masked form "vor.vv vd, vs2, vs1, v0.t".
# Arguments: destination, first source (vs2) and second source (vs1) vector
# register names.
# Returns the string ".word " followed by the instruction word in decimal.
sub vor_vv_v0t {
    my ($dest, $src2, $src1) = @_;
    my $insn = 0b0010100_00000_00000_000_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src2);
    my $vs1 = read_vreg($src1);
    $insn |= ($vd << 7) | ($vs1 << 15) | ($vs2 << 20);
    return ".word $insn";
}
# Build the ".word" directive for "vse64.v vs3, (rs1)".
# Arguments: name of the vector register holding the data to store, base
# address register name.
# Returns the string ".word " followed by the instruction word in decimal.
sub vse64_v {
    my ($data, $base) = @_;
    my $insn = 0b0000001_00000_00000_111_00000_0100111;
    my $vs3 = read_vreg($data);
    my $rs1 = read_reg($base);
    $insn |= ($vs3 << 7) | ($rs1 << 15);
    return ".word $insn";
}
# Return the ".word" directive for the fixed instruction
# "vsetivli x0, 2, e64, m1, tu, mu".  Takes no arguments; the encoding is a
# precomputed constant.
sub vsetivli__x0_2_e64_m1_tu_mu {
    my $encoding = "0xc1817057";
    return ".word ".$encoding;
}
# Build the ".word" directive for "vslidedown.vi vd, vs2, uimm".
# Arguments: destination vector register name, source vector register name,
# unsigned immediate.
# The immediate occupies the 5-bit field at bits 19:15, so only 0..31 can be
# encoded; other values would overwrite the vs2 field and are rejected.
# Returns the string ".word " followed by the instruction word in decimal.
sub vslidedown_vi {
    my ($dest, $src, $uimm) = @_;
    my $template = 0b0011111_00000_00000_011_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src);
    die("vslidedown.vi immediate out of range: ".$uimm."\n")
        if ($uimm < 0 || $uimm > 31);
    return ".word ".($template | ($vs2 << 20) | ($uimm << 15) | ($vd << 7));
}
# Build the ".word" directive for the masked form
# "vslideup.vi vd, vs2, uimm, v0.t".
# Arguments: destination vector register name, source vector register name,
# unsigned immediate.
# The immediate occupies the 5-bit field at bits 19:15, so only 0..31 can be
# encoded; other values would overwrite the vs2 field and are rejected.
# Returns the string ".word " followed by the instruction word in decimal.
sub vslideup_vi_v0t {
    my ($dest, $src, $uimm) = @_;
    my $template = 0b0011100_00000_00000_011_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src);
    die("vslideup.vi immediate out of range: ".$uimm."\n")
        if ($uimm < 0 || $uimm > 31);
    return ".word ".($template | ($vs2 << 20) | ($uimm << 15) | ($vd << 7));
}
# Build the ".word" directive for "vslideup.vi vd, vs2, uimm".
# Arguments: destination vector register name, source vector register name,
# unsigned immediate.
# The immediate occupies the 5-bit field at bits 19:15, so only 0..31 can be
# encoded; other values would overwrite the vs2 field and are rejected.
# Returns the string ".word " followed by the instruction word in decimal.
sub vslideup_vi {
    my ($dest, $src, $uimm) = @_;
    my $template = 0b0011101_00000_00000_011_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src);
    die("vslideup.vi immediate out of range: ".$uimm."\n")
        if ($uimm < 0 || $uimm > 31);
    return ".word ".($template | ($vs2 << 20) | ($uimm << 15) | ($vd << 7));
}
# Build the ".word" directive for "vsll.vi vd, vs2, uimm".
# Arguments: destination vector register name, source vector register name,
# unsigned shift amount.
# The immediate occupies the 5-bit field at bits 19:15, so only 0..31 can be
# encoded; other values would overwrite the vs2 field and are rejected.
# Returns the string ".word " followed by the instruction word in decimal.
sub vsll_vi {
    my ($dest, $src, $uimm) = @_;
    my $template = 0b1001011_00000_00000_011_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src);
    die("vsll.vi immediate out of range: ".$uimm."\n")
        if ($uimm < 0 || $uimm > 31);
    return ".word ".($template | ($vs2 << 20) | ($uimm << 15) | ($vd << 7));
}
# Build the ".word" directive for "vsrl.vx vd, vs2, rs1".
# Arguments: destination vector register name, source vector register name,
# scalar register name.
# Returns the string ".word " followed by the instruction word in decimal.
sub vsrl_vx {
    my ($dest, $src, $scalar) = @_;
    my $insn = 0b1010001_00000_00000_100_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src);
    my $rs1 = read_reg($scalar);
    $insn |= ($vd << 7) | ($rs1 << 15) | ($vs2 << 20);
    return ".word $insn";
}
# Build the ".word" directive for "vsse64.v vs3, (rs1), rs2".
# Arguments: name of the vector register holding the data to store, base
# address register name, stride register name.
# Returns the string ".word " followed by the instruction word in decimal.
sub vsse64_v {
    my ($data, $base, $stride) = @_;
    my $insn = 0b0000101_00000_00000_111_00000_0100111;
    my $vs3 = read_vreg($data);
    my $rs1 = read_reg($base);
    my $rs2 = read_reg($stride);
    $insn |= ($vs3 << 7) | ($rs1 << 15) | ($rs2 << 20);
    return ".word $insn";
}
# Build the ".word" directive for the masked form
# "vxor.vv vd, vs2, vs1, v0.t".
# Arguments: destination, first source (vs2) and second source (vs1) vector
# register names.
# Returns the string ".word " followed by the instruction word in decimal.
sub vxor_vv_v0t {
    my ($dest, $src2, $src1) = @_;
    my $insn = 0b0010110_00000_00000_000_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src2);
    my $vs1 = read_vreg($src1);
    $insn |= ($vd << 7) | ($vs1 << 15) | ($vs2 << 20);
    return ".word $insn";
}
# Build the ".word" directive for "vxor.vv vd, vs2, vs1".
# Arguments: destination, first source (vs2) and second source (vs1) vector
# register names.
# Returns the string ".word " followed by the instruction word in decimal.
sub vxor_vv {
    my ($dest, $src2, $src1) = @_;
    my $insn = 0b0010111_00000_00000_000_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src2);
    my $vs1 = read_vreg($src1);
    $insn |= ($vd << 7) | ($vs1 << 15) | ($vs2 << 20);
    return ".word $insn";
}
# Vector crypto instructions
## Zvbb instructions
# Build the ".word" directive for "vrev8.v vd, vs2".
# Arguments: destination vector register name, source vector register name.
# Returns the string ".word " followed by the instruction word in decimal.
sub vrev8_v {
    my ($dest, $src) = @_;
    my $insn = 0b0100101_00000_01001_010_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src);
    $insn |= ($vd << 7) | ($vs2 << 20);
    return ".word $insn";
}
## Zvbc instructions
# Build the ".word" directive for "vclmulh.vx vd, vs2, rs1".
# Arguments: destination vector register name, source vector register name,
# scalar register name.
# Returns the string ".word " followed by the instruction word in decimal.
sub vclmulh_vx {
    my ($dest, $src, $scalar) = @_;
    my $insn = 0b0011011_00000_00000_110_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src);
    my $rs1 = read_reg($scalar);
    $insn |= ($vd << 7) | ($rs1 << 15) | ($vs2 << 20);
    return ".word $insn";
}
# Build the ".word" directive for the masked form
# "vclmul.vx vd, vs2, rs1, v0.t".
# Arguments: destination vector register name, source vector register name,
# scalar register name.
# Returns the string ".word " followed by the instruction word in decimal.
sub vclmul_vx_v0t {
    my ($dest, $src, $scalar) = @_;
    my $insn = 0b0011000_00000_00000_110_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src);
    my $rs1 = read_reg($scalar);
    $insn |= ($vd << 7) | ($rs1 << 15) | ($vs2 << 20);
    return ".word $insn";
}
# Build the ".word" directive for "vclmul.vx vd, vs2, rs1".
# Arguments: destination vector register name, source vector register name,
# scalar register name.
# Returns the string ".word " followed by the instruction word in decimal.
sub vclmul_vx {
    my ($dest, $src, $scalar) = @_;
    my $insn = 0b0011001_00000_00000_110_00000_1010111;
    my $vd = read_vreg($dest);
    my $vs2 = read_vreg($src);
    my $rs1 = read_reg($scalar);
    $insn |= ($vd << 7) | ($rs1 << 15) | ($vs2 << 20);
    return ".word $insn";
}
1;
......@@ -33,6 +33,8 @@ RISCV_DEFINE_CAP(ZKSH, 0, 11)
RISCV_DEFINE_CAP(ZKR, 0, 12)
RISCV_DEFINE_CAP(ZKT, 0, 13)
RISCV_DEFINE_CAP(V, 0, 14)
RISCV_DEFINE_CAP(ZVBB, 0, 15)
RISCV_DEFINE_CAP(ZVBC, 0, 16)
/*
* In the future ...
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册