diff --git a/src/coreclr/src/ToolBox/superpmi/superpmi/icorjitinfo.cpp b/src/coreclr/src/ToolBox/superpmi/superpmi/icorjitinfo.cpp index 1591f347cbe90f35b64ee796553d78a2899ac104..24bc4415e586e08a7cfacccec87d9d45f0d1a4c0 100644 --- a/src/coreclr/src/ToolBox/superpmi/superpmi/icorjitinfo.cpp +++ b/src/coreclr/src/ToolBox/superpmi/superpmi/icorjitinfo.cpp @@ -1578,6 +1578,18 @@ bool MyICJI::runWithErrorTrap(void (*function)(void*), void* param) return RunWithErrorTrap(function, param); } +// Ideally we'd just use the copies of this in standardmacros.h +// however, superpmi is missing various other dependencies as well +static size_t ALIGN_UP_SPMI(size_t val, size_t alignment) +{ + return (val + (alignment - 1)) & ~(alignment - 1); +} + +static void* ALIGN_UP_SPMI(void* val, size_t alignment) +{ + return (void*)ALIGN_UP_SPMI((size_t)val, alignment); +} + // get a block of memory for the code, readonly data, and read-write data void MyICJI::allocMem(ULONG hotCodeSize, /* IN */ ULONG coldCodeSize, /* IN */ @@ -1590,13 +1602,46 @@ void MyICJI::allocMem(ULONG hotCodeSize, /* IN */ ) { jitInstance->mc->cr->AddCall("allocMem"); - // TODO-Cleanup: investigate if we need to check roDataBlock as well. Could hot block size be ever 0? + + // TODO-Cleanup: Could hot block size be ever 0? *hotCodeBlock = jitInstance->mc->cr->allocateMemory(hotCodeSize); + if (coldCodeSize > 0) *coldCodeBlock = jitInstance->mc->cr->allocateMemory(coldCodeSize); else *coldCodeBlock = nullptr; - *roDataBlock = jitInstance->mc->cr->allocateMemory(roDataSize); + + if (roDataSize > 0) + { + size_t roDataAlignment = sizeof(void*); + size_t roDataAlignedSize = static_cast(roDataSize); + + if ((flag & CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN) != 0) + { + roDataAlignment = 32; + } + else if ((flag & CORJIT_ALLOCMEM_FLG_RODATA_16BYTE_ALIGN) != 0) + { + roDataAlignment = 16; + } + else if (roDataSize >= 8) + { + roDataAlignment = 8; + } + + // We need to round the roDataSize up to the alignment size and then + // overallocate by at most alignment - sizeof(void*) to ensure that + // we can offset roDataBlock to be an aligned address and that the + // allocation contains at least the originally requested size after + + roDataAlignedSize = ALIGN_UP_SPMI(roDataAlignedSize, roDataAlignment); + roDataAlignedSize = roDataAlignedSize + (roDataAlignment - sizeof(void*)); + *roDataBlock = jitInstance->mc->cr->allocateMemory(roDataAlignedSize); + *roDataBlock = ALIGN_UP_SPMI(*roDataBlock, roDataAlignment); + } + else + *roDataBlock = nullptr; + jitInstance->mc->cr->recAllocMem(hotCodeSize, coldCodeSize, roDataSize, xcptnsCount, flag, hotCodeBlock, coldCodeBlock, roDataBlock); } diff --git a/src/coreclr/src/jit/emit.cpp b/src/coreclr/src/jit/emit.cpp index f0fd875006c9747fa53a7c52b8f9c5b778479555..8b9c2c7e21fe87b7e5938f61580b738aef15503b 100644 --- a/src/coreclr/src/jit/emit.cpp +++ b/src/coreclr/src/jit/emit.cpp @@ -5339,10 +5339,11 @@ UNATIVE_OFFSET emitter::emitDataGenBeg(UNATIVE_OFFSET size, bool align) { // Data can have any size but since alignment is deduced from the size there's no // way to have a larger data size (e.g. 128) and request 4/8/16 byte alignment. - // 32 bytes (and more) alignment requires VM support (see ICorJitInfo::allocMem). - assert(size <= 16); + // As such, we restrict data above 16 bytes to be a multiple of 16 and assume 16-byte + // alignment. Alignment greater than 16 requires VM support (see ICorJitInfo::allocMem). 
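// (Worked example, not part of this diff: the "VM support" referred to here is the allocMem
//  handling shown in the superpmi hunk above. Assuming a 16-byte alignment request on a 64-bit
//  host whose allocator only guarantees pointer alignment:
//      roDataSize = 20, roDataAlignment = 16
//      ALIGN_UP_SPMI(20, 16)            == 32   // (20 + 15) & ~15
//      32 + (16 - sizeof(void*))        == 40   // slack so the block can be shifted forward
//      ALIGN_UP_SPMI(roDataBlock, 16)           // yields a 16-byte aligned pointer that still
//                                               // has at least the requested 20 bytes behind it.)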
+ assert((size <= 16) || ((size % 16) == 0)); - if (size == 16) + if (size >= 16) { emitConsDsc.align16 = true; } diff --git a/src/coreclr/src/jit/emitxarch.cpp b/src/coreclr/src/jit/emitxarch.cpp index 269324d9a9d4f92eacc07b45bbc985ed984e28b0..af753945e3d2feb2a4600934cbb8d16721a4d398 100644 --- a/src/coreclr/src/jit/emitxarch.cpp +++ b/src/coreclr/src/jit/emitxarch.cpp @@ -10765,7 +10765,8 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } // Check that the offset is properly aligned (i.e. the ddd in [ddd]) - assert((emitChkAlign == false) || (ins == INS_lea) || (((size_t)addr & (byteSize - 1)) == 0)); + assert((emitChkAlign == false) || (ins == INS_lea) || + ((byteSize < 16) && (((size_t)addr & (byteSize - 1)) == 0)) || (((size_t)addr & (16 - 1)) == 0)); } else { diff --git a/src/coreclr/src/jit/hwintrinsic.cpp b/src/coreclr/src/jit/hwintrinsic.cpp index 674b7f9946e0dc89f4149d670abd9060bdb7b518..8f33c4851771bc44637fcc0646c24cfd5c1815c0 100644 --- a/src/coreclr/src/jit/hwintrinsic.cpp +++ b/src/coreclr/src/jit/hwintrinsic.cpp @@ -292,16 +292,19 @@ CORINFO_CLASS_HANDLE Compiler::gtGetStructHandleForHWSIMD(var_types simdType, va // lookupId: Gets the NamedIntrinsic for a given method name and InstructionSet // // Arguments: +// comp -- The compiler +// sig -- The signature of the intrinsic // className -- The name of the class associated with the HWIntrinsic to lookup // methodName -- The name of the method associated with the HWIntrinsic to lookup // enclosingClassName -- The name of the enclosing class of X64 classes // // Return Value: // The NamedIntrinsic associated with methodName and isa -NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, - const char* className, - const char* methodName, - const char* enclosingClassName) +NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, + CORINFO_SIG_INFO* sig, + const char* className, + const char* methodName, + const char* enclosingClassName) { // TODO-Throughput: replace sequential search by binary search CORINFO_InstructionSet isa = lookupIsa(className, enclosingClassName); @@ -324,14 +327,23 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, for (int i = 0; i < (NI_HW_INTRINSIC_END - NI_HW_INTRINSIC_START - 1); i++) { + const HWIntrinsicInfo& intrinsicInfo = hwIntrinsicInfoArray[i]; + if (isa != hwIntrinsicInfoArray[i].isa) { continue; } - if (strcmp(methodName, hwIntrinsicInfoArray[i].name) == 0) + int numArgs = static_cast(intrinsicInfo.numArgs); + + if ((numArgs != -1) && (sig->numArgs != static_cast(intrinsicInfo.numArgs))) + { + continue; + } + + if (strcmp(methodName, intrinsicInfo.name) == 0) { - return hwIntrinsicInfoArray[i].id; + return intrinsicInfo.id; } } diff --git a/src/coreclr/src/jit/hwintrinsic.h b/src/coreclr/src/jit/hwintrinsic.h index b3c00f45d9809733d48422bae7a1bdcc66e2a393..7e5fe7905d45422af49bc8434393b3bd31273a8e 100644 --- a/src/coreclr/src/jit/hwintrinsic.h +++ b/src/coreclr/src/jit/hwintrinsic.h @@ -258,10 +258,11 @@ struct HWIntrinsicInfo static const HWIntrinsicInfo& lookup(NamedIntrinsic id); - static NamedIntrinsic lookupId(Compiler* comp, - const char* className, - const char* methodName, - const char* enclosingClassName); + static NamedIntrinsic lookupId(Compiler* comp, + CORINFO_SIG_INFO* sig, + const char* className, + const char* methodName, + const char* enclosingClassName); static CORINFO_InstructionSet lookupIsa(const char* className, const char* enclosingClassName); static unsigned lookupSimdSize(Compiler* comp, NamedIntrinsic id, CORINFO_SIG_INFO* 
sig); diff --git a/src/coreclr/src/jit/hwintrinsiclistxarch.h b/src/coreclr/src/jit/hwintrinsiclistxarch.h index 4be0cdadd809c7ba1f812c2ab17f02502328e67d..db2bacbbb64575442cd03458e8ecbb6a4e1f055e 100644 --- a/src/coreclr/src/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/src/jit/hwintrinsiclistxarch.h @@ -43,6 +43,7 @@ HARDWARE_INTRINSIC(Vector128, AsVector2, HARDWARE_INTRINSIC(Vector128, AsVector3, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, AsVector4, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, AsVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) // The instruction generated for float/double depends on which ISAs are supported HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) @@ -76,6 +77,7 @@ HARDWARE_INTRINSIC(Vector256, AsVector256, HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, get_Count, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, 
HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) diff --git a/src/coreclr/src/jit/hwintrinsicxarch.cpp b/src/coreclr/src/jit/hwintrinsicxarch.cpp index 010cd5fa5063d591d74421ddb8f72b6f598241a0..1ed4dd771615197948582a7433e3c851cc57a6cb 100644 --- a/src/coreclr/src/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/src/jit/hwintrinsicxarch.cpp @@ -521,6 +521,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, { GenTree* retNode = nullptr; GenTree* op1 = nullptr; + GenTree* op2 = nullptr; if (!featureSIMD) { @@ -747,6 +748,70 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector128_Create: + case NI_Vector256_Create: + { +#if defined(TARGET_X86) + if (varTypeIsLong(baseType)) + { + // TODO-XARCH-CQ: It may be beneficial to emit the movq + // instruction, which takes a 64-bit memory address and + // works on 32-bit x86 systems. + break; + } +#endif // TARGET_X86 + + // We shouldn't handle this as an intrinsic if the + // respective ISAs have been disabled by the user. + + if (intrinsic == NI_Vector256_Create) + { + if (!compExactlyDependsOn(InstructionSet_AVX)) + { + break; + } + } + else if (baseType == TYP_FLOAT) + { + if (!compExactlyDependsOn(InstructionSet_SSE)) + { + break; + } + } + else if (!compExactlyDependsOn(InstructionSet_SSE2)) + { + break; + } + + if (sig->numArgs == 1) + { + op1 = impPopStack().val; + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); + } + else if (sig->numArgs == 2) + { + op2 = impPopStack().val; + op1 = impPopStack().val; + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, baseType, simdSize); + } + else + { + assert(sig->numArgs >= 3); + + GenTreeArgList* tmp = nullptr; + + for (unsigned i = 0; i < sig->numArgs; i++) + { + tmp = gtNewArgList(impPopStack().val); + tmp->gtOp2 = op1; + op1 = tmp; + } + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); + } + break; + } + case NI_Vector128_CreateScalarUnsafe: { assert(sig->numArgs == 1); diff --git a/src/coreclr/src/jit/importer.cpp b/src/coreclr/src/jit/importer.cpp index aae168c337883e45b07d2d0a8c7227162627cc95..52501b346e928b67110752812c1f964a92991d5e 100644 --- a/src/coreclr/src/jit/importer.cpp +++ b/src/coreclr/src/jit/importer.cpp @@ -4504,7 +4504,10 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method) if ((namespaceName[0] == '\0') || (strcmp(namespaceName, platformNamespaceName) == 0)) { - result = HWIntrinsicInfo::lookupId(this, className, methodName, enclosingClassName); + CORINFO_SIG_INFO sig; + info.compCompHnd->getMethodSig(method, &sig); + + result = HWIntrinsicInfo::lookupId(this, &sig, className, methodName, enclosingClassName); } else if (strcmp(methodName, "get_IsSupported") == 0) { diff --git a/src/coreclr/src/jit/lower.h b/src/coreclr/src/jit/lower.h index 9c79ede16869a484ba87f58e9221832b1654e8ba..2fa26918e30f4f201caeea6aa10c34267dac16ad 100644 --- a/src/coreclr/src/jit/lower.h +++ 
b/src/coreclr/src/jit/lower.h @@ -315,6 +315,7 @@ private: #ifdef FEATURE_HW_INTRINSICS void LowerHWIntrinsic(GenTreeHWIntrinsic* node); void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition); + void LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node); void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node); #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/src/jit/lowerxarch.cpp b/src/coreclr/src/jit/lowerxarch.cpp index dc5ed420cf19fb5364c71200d8989e0068ccb694..14672d16aaaac60017c65a0be56db92dfbe803e1 100644 --- a/src/coreclr/src/jit/lowerxarch.cpp +++ b/src/coreclr/src/jit/lowerxarch.cpp @@ -927,8 +927,26 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) node->gtType = TYP_SIMD16; } - switch (node->gtHWIntrinsicId) + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + + switch (intrinsicId) { + case NI_Vector128_Create: + case NI_Vector256_Create: + { + // We don't directly support the Vector128.Create or Vector256.Create methods in codegen + // and instead lower them to other intrinsic nodes in LowerHWIntrinsicCreate so we expect + // that the node is modified to either not be a HWIntrinsic node or that it is no longer + // the same intrinsic as when it came in. In the case of Vector256.Create, we may lower + // it into 2x Vector128.Create intrinsics which themselves are also lowered into other + // intrinsics that are not Vector*.Create + + LowerHWIntrinsicCreate(node); + assert(!node->OperIsHWIntrinsic() || (node->gtHWIntrinsicId != intrinsicId)); + LowerNode(node); + return; + } + case NI_SSE2_CompareGreaterThan: { if (node->gtSIMDBaseType != TYP_DOUBLE) @@ -1081,6 +1099,1214 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) ContainCheckHWIntrinsic(node); } + +union VectorConstant { + int8_t i8[32]; + uint8_t u8[32]; + int16_t i16[16]; + uint16_t u16[16]; + int32_t i32[8]; + uint32_t u32[8]; + int64_t i64[4]; + uint64_t u64[4]; + float f32[8]; + double f64[4]; +}; + +//---------------------------------------------------------------------------------------------- +// ProcessArgForHWIntrinsicCreate: Processes an argument for the Lowering::LowerHWIntrinsicCreate method +// +// Arguments: +// arg - The argument to process +// argIdx - The index of the argument being processed +// vecCns - The vector constant being constructed +// baseType - The base type of the vector constant +// +// Returns: +// true if arg was a constant; otherwise, false +static bool HandleArgForHWIntrinsicCreate(GenTree* arg, int argIdx, VectorConstant& vecCns, var_types baseType) +{ + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + if (arg->IsCnsIntOrI()) + { + vecCns.i8[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); + return true; + } + else + { + // We expect the VectorConstant to have been already zeroed + assert(vecCns.i8[argIdx] == 0); + } + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + if (arg->IsCnsIntOrI()) + { + vecCns.i16[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); + return true; + } + else + { + // We expect the VectorConstant to have been already zeroed + assert(vecCns.i16[argIdx] == 0); + } + break; + } + + case TYP_INT: + case TYP_UINT: + { + if (arg->IsCnsIntOrI()) + { + vecCns.i32[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); + return true; + } + else + { + // We expect the VectorConstant to have been already zeroed + assert(vecCns.i32[argIdx] == 0); + } + break; + } + + case TYP_LONG: + case TYP_ULONG: + { + if (arg->OperIs(GT_CNS_LNG)) + { + vecCns.i64[argIdx] = 
static_cast(arg->AsLngCon()->gtLconVal); + return true; + } + else + { + // We expect the VectorConstant to have been already zeroed + assert(vecCns.i64[argIdx] == 0); + } + break; + } + + case TYP_FLOAT: + { + if (arg->IsCnsFltOrDbl()) + { + vecCns.f32[argIdx] = static_cast(arg->AsDblCon()->gtDconVal); + return true; + } + else + { + // We expect the VectorConstant to have been already zeroed + // We check against the i32, rather than f32, to account for -0.0 + assert(vecCns.i32[argIdx] == 0); + } + break; + } + + case TYP_DOUBLE: + { + if (arg->IsCnsFltOrDbl()) + { + vecCns.f64[argIdx] = static_cast(arg->AsDblCon()->gtDconVal); + return true; + } + else + { + // We expect the VectorConstant to have been already zeroed + // We check against the i64, rather than f64, to account for -0.0 + assert(vecCns.i64[argIdx] == 0); + } + break; + } + + default: + { + unreached(); + } + } + + return false; +} + +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicCreate: Lowers a Vector64, Vector128, or Vector256 Create call +// +// Arguments: +// node - The hardware intrinsic node. +// +void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + var_types simdType = node->gtType; + var_types baseType = node->gtSIMDBaseType; + unsigned simdSize = node->gtSIMDSize; + VectorConstant vecCns = {}; + + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(baseType)); + assert(simdSize != 0); + + GenTreeArgList* argList = nullptr; + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + + // Spare GenTrees to be used for the lowering logic below + // Defined upfront to avoid naming conflicts, etc... + GenTree* idx = nullptr; + GenTree* tmp1 = nullptr; + GenTree* tmp2 = nullptr; + GenTree* tmp3 = nullptr; + + assert(op1 != nullptr); + + unsigned argCnt = 0; + unsigned cnsArgCnt = 0; + + if (op1->OperIsList()) + { + assert(op2 == nullptr); + + for (argList = op1->AsArgList(); argList != nullptr; argList = argList->Rest()) + { + if (HandleArgForHWIntrinsicCreate(argList->Current(), argCnt, vecCns, baseType)) + { + cnsArgCnt += 1; + } + argCnt += 1; + } + } + else + { + if (HandleArgForHWIntrinsicCreate(op1, argCnt, vecCns, baseType)) + { + cnsArgCnt += 1; + } + argCnt += 1; + + if (op2 != nullptr) + { + if (HandleArgForHWIntrinsicCreate(op2, argCnt, vecCns, baseType)) + { + cnsArgCnt += 1; + } + argCnt += 1; + } + else if (cnsArgCnt == 1) + { + // These intrinsics are meant to set the same value to every element + // so we'll just specially handle it here and copy it into the remaining + // indices. 
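// (Illustration, not part of this diff: for the single-argument form, e.g. Vector128.Create(5)
//  with a constant int, the loop below turns the one recorded value into a full splat, i.e.
//  vecCns.i32[0] through vecCns.i32[3] all become 5, so the all-constant path further down can
//  emit the whole node as a single 16-byte data-section constant.)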
+ + for (unsigned i = 1; i < simdSize / genTypeSize(baseType); i++) + { + HandleArgForHWIntrinsicCreate(op1, i, vecCns, baseType); + } + } + } + assert((argCnt == 1) || (argCnt == (simdSize / genTypeSize(baseType)))); + + if (argCnt == cnsArgCnt) + { + if (op1->OperIsList()) + { + for (argList = op1->AsArgList(); argList != nullptr; argList = argList->Rest()) + { + BlockRange().Remove(argList->Current()); + } + } + else + { + BlockRange().Remove(op1); + + if (op2 != nullptr) + { + BlockRange().Remove(op2); + } + } + + CORINFO_FIELD_HANDLE hnd = comp->GetEmitter()->emitAnyConst(&vecCns, simdSize, emitDataAlignment::Required); + GenTree* clsVarAddr = new (comp, GT_CLS_VAR_ADDR) GenTreeClsVar(GT_CLS_VAR_ADDR, TYP_I_IMPL, hnd, nullptr); + BlockRange().InsertBefore(node, clsVarAddr); + + node->ChangeOper(GT_IND); + node->gtOp1 = clsVarAddr; + + // TODO-XARCH-CQ: We should be able to modify at least the paths that use Insert to trivially support partial + // vector constants. With this, we can create a constant if say 50% of the inputs are also constant and just + // insert the non-constant values which should still allow some gains. + + return; + } + else if (argCnt == 1) + { + // We have the following (where simd is simd16 or simd32): + // /--* op1 T + // node = * HWINTRINSIC simd T Create + + if (intrinsicId == NI_Vector256_Create) + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + // We will be constructing the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* tmp1 simd16 + // node = * HWINTRINSIC simd32 T BroadcastScalarToVector256 + + // This is roughly the following managed code: + // var tmp1 = Vector128.CreateScalarUnsafe(op1); + // return Avx2.BroadcastScalarToVector256(tmp1); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + + node->gtOp1 = tmp1; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_AVX2_BroadcastScalarToVector256; + return; + } + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + + // We will be constructing the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd16 T Create + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp2 simd16 + // tmp3 = * HWINTRINSIC simd16 T ToVector256Unsafe + // idx = CNS_INT int 0 + // /--* tmp3 simd32 + // +--* tmp1 simd16 + // +--* idx int + // node = * HWINTRINSIC simd32 T InsertVector128 + + // This is roughly the following managed code: + // var tmp1 = Vector128.Create(op1); + // var tmp2 = tmp1; + // var tmp3 = tmp2.ToVector256Unsafe(); + // return Avx.InsertVector128(tmp3, tmp1, 0x01); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_Create, baseType, 16); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + tmp3 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, tmp2, NI_Vector128_ToVector256Unsafe, baseType, 16); + BlockRange().InsertAfter(tmp2, tmp3); + LowerNode(tmp3); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(tmp3, idx); + + node->gtOp1 = comp->gtNewArgList(tmp3, tmp1, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_AVX_InsertVector128; + return; + } + + // We will be constructing 
the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // ... + + // This is roughly the following managed code: + // var tmp1 = Vector128.CreateScalarUnsafe(op1); + // ... + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + + if ((baseType != TYP_DOUBLE) && comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // node = * HWINTRINSIC simd16 T BroadcastScalarToVector128 + + // This is roughly the following managed code: + // ... + // return Avx2.BroadcastScalarToVector128(tmp1); + + node->gtOp1 = tmp1; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_AVX2_BroadcastScalarToVector128; + return; + } + + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSSE3)) + { + // We will be constructing the following parts: + // ... + // tmp2 = HWINTRINSIC simd16 ubyte get_Zero + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 ubyte Shuffle + + // This is roughly the following managed code: + // ... + // var tmp2 = Vector128.Zero; + // return Ssse3.Shuffle(tmp1, tmp2); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, NI_Vector128_get_Zero, TYP_UBYTE, simdSize); + BlockRange().InsertAfter(tmp1, tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSSE3_Shuffle; + break; + } + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp1 = * HWINTRINSIC simd16 ubyte UnpackLow + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // tmp1 = Sse2.UnpackLow(tmp1, tmp2); + // ... + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_SSE2_UnpackLow, TYP_UBYTE, simdSize); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + + __fallthrough; + } + + case TYP_SHORT: + case TYP_USHORT: + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp1 = * HWINTRINSIC simd16 ushort UnpackLow + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // tmp1 = Sse2.UnpackLow(tmp1, tmp2); + // ... + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_SSE2_UnpackLow, TYP_USHORT, simdSize); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + + __fallthrough; + } + + case TYP_INT: + case TYP_UINT: + { + // We will be constructing the following parts: + // ... 
+ // idx = CNS_INT int 0 + // /--* tmp1 simd16 + // +--* idx int + // node = * HWINTRINSIC simd16 uint Shuffle + + // This is roughly the following managed code: + // ... + // return Sse2.Shuffle(tmp1, 0x00); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + idx = comp->gtNewIconNode(0x00, TYP_INT); + BlockRange().InsertAfter(tmp1, idx); + + node->gtOp1 = tmp1; + node->gtOp2 = idx; + + node->gtHWIntrinsicId = NI_SSE2_Shuffle; + node->gtSIMDBaseType = TYP_UINT; + + break; + } + +#if defined(TARGET_AMD64) + case TYP_LONG: + case TYP_ULONG: + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 ulong UnpackLow + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // return Sse2.UnpackLow(tmp1, tmp2); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE2_UnpackLow; + break; + } +#endif // TARGET_AMD64 + + case TYP_FLOAT: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX)) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int 0 + // /--* tmp1 simd16 + // +--* idx int + // node = * HWINTRINSIC simd16 float Permute + + // This is roughly the following managed code: + // ... + // return Avx.Permute(tmp1, 0x00); + + idx = comp->gtNewIconNode(0x00, TYP_INT); + BlockRange().InsertAfter(tmp1, idx); + + node->gtOp1 = tmp1; + node->gtOp2 = idx; + + node->gtHWIntrinsicId = NI_AVX_Permute; + break; + } + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // idx = CNS_INT int 0 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // +--* idx int + // node = * HWINTRINSIC simd16 float Shuffle + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // return Sse.Shuffle(tmp1, tmp2, 0x00); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE)); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + idx = comp->gtNewIconNode(0x00, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + node->gtOp1 = comp->gtNewArgList(tmp1, tmp2, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_SSE_Shuffle; + break; + } + + case TYP_DOUBLE: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // node = * HWINTRINSIC simd16 double MoveAndDuplicate + + // This is roughly the following managed code: + // ... + // return Sse3.MoveAndDuplicate(tmp1); + + node->gtOp1 = tmp1; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_SSE3_MoveAndDuplicate; + break; + } + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + // We will be constructing the following parts: + // ... 
+ // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 float MoveLowToHigh + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // return Sse.MoveLowToHigh(tmp1, tmp2); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE_MoveLowToHigh; + node->gtSIMDBaseType = TYP_FLOAT; + + break; + } + + default: + { + unreached(); + } + } + + return; + } + + // We have the following (where simd is simd16 or simd32): + // /--* op1 T + // +--* ... T + // +--* opN T + // node = * HWINTRINSIC simd T Create + + if (intrinsicId == NI_Vector256_Create) + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + + // We will be constructing the following parts: + // /--* op1 T + // +--* ... T + // lo = * HWINTRINSIC simd16 T Create + // /--* ... T + // +--* opN T + // hi = * HWINTRINSIC simd16 T Create + // idx = CNS_INT int 1 + // /--* lo simd32 + // +--* hi simd16 + // +--* idx int + // node = * HWINTRINSIC simd32 T InsertVector128 + + // This is roughly the following managed code: + // ... + // var lo = Vector128.Create(op1, ...); + // var hi = Vector128.Create(..., opN); + // return Avx.InsertVector128(lo, hi, 0x01); + + // Each Vector128.Create call gets half the operands. That is: + // lo = Vector128.Create(op1, op2); + // hi = Vector128.Create(op3, op4); + // -or- + // lo = Vector128.Create(op1, ..., op3); + // hi = Vector128.Create(op4, ..., op7); + // -or- + // lo = Vector128.Create(op1, ..., op7); + // hi = Vector128.Create(op8, ..., op15); + // -or- + // lo = Vector128.Create(op1, ..., op15); + // hi = Vector128.Create(op16, ..., op31); + + unsigned halfArgCnt = argCnt / 2; + assert((halfArgCnt * 2) == argCnt); + + argList = op1->AsArgList(); + + for (unsigned i = 0; i < halfArgCnt; i++) + { + op2 = argList; + argList = argList->Rest(); + } + + op2->AsArgList()->gtOp2 = nullptr; + op2 = argList; + + // The above for loop splits the operand count into exactly half. + // Once it exits, op1 will point to op1 and op2 will point to the + // last operand that will be passed to the first Vector128.Create + // We will set its op2 to null, terminating the chain and then + // assign op2 to be argList, which is the first operand that will + // get passed to the second Vector128.Create + + GenTree* lo = nullptr; + GenTree* hi = nullptr; + + if (halfArgCnt == 2) + { + // The Vector256.Create calls that take 4 operands are special + // because the half argument count is 2, which means we can't + // actually use the GT_LIST anymore and need to pass them as + // explicit operands instead. 
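// (Illustration, not part of this diff: for Vector256.Create(a, b, c, d) the split above leaves
//  op1 -> (a, b) and op2 -> (c, d), and this branch reads both elements of each half directly:
//      lo = Vector128.Create(a, b);
//      hi = Vector128.Create(c, d);
//  The common tail below then rewrites the node as Avx.InsertVector128(lo, hi, 0x01).)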
+ + argList = op1->AsArgList(); + + tmp1 = argList->Current(); + tmp2 = argList->Rest()->Current(); + + lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_Vector128_Create, baseType, 16); + BlockRange().InsertAfter(tmp2, lo); + LowerNode(lo); + + argList = op2->AsArgList(); + + tmp1 = argList->Current(); + tmp2 = argList->Rest()->Current(); + + hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_Vector128_Create, baseType, 16); + BlockRange().InsertAfter(tmp2, hi); + LowerNode(hi); + } + else + { + // The rest of the Vector256.Create calls take at least 8 operands + // and so the half count is at least 4 and we have to continue + // passing around GT_LIST nodes in op1 with a null op2 + assert(halfArgCnt >= 4); + + tmp1 = op2->AsArgList()->Current(); + + lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_Create, baseType, 16); + BlockRange().InsertBefore(tmp1, lo); + LowerNode(lo); + + hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_Vector128_Create, baseType, 16); + BlockRange().InsertBefore(node, hi); + LowerNode(hi); + } + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(hi, idx); + + node->gtOp1 = comp->gtNewArgList(lo, hi, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_AVX_InsertVector128; + return; + } + + if (op1->OperIsList()) + { + argList = op1->AsArgList(); + op1 = argList->Current(); + argList = argList->Rest(); + } + + // We will be constructing the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // ... + + // This is roughly the following managed code: + // var tmp1 = Vector128.CreateScalarUnsafe(op1); + // ... + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + case TYP_SHORT: + case TYP_USHORT: + case TYP_INT: + case TYP_UINT: + { + unsigned N = 0; + GenTree* opN = nullptr; + NamedIntrinsic insIntrinsic = NI_Illegal; + + if ((baseType == TYP_SHORT) || (baseType == TYP_USHORT)) + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + insIntrinsic = NI_SSE2_Insert; + } + else if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + insIntrinsic = NI_SSE41_Insert; + } + + if (insIntrinsic != NI_Illegal) + { + for (N = 1; N < argCnt - 1; N++) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // tmp1 = * HWINTRINSIC simd16 T Insert + // ... + + // This is roughly the following managed code: + // ... + // tmp1 = Sse?.Insert(tmp1, opN, N); + // ... + + opN = argList->Current(); + + idx = comp->gtNewIconNode(N, TYP_INT); + BlockRange().InsertAfter(opN, idx); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, opN, idx, insIntrinsic, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp1); + LowerNode(tmp1); + + argList = argList->Rest(); + } + + assert(N == (argCnt - 1)); + + // We will be constructing the following parts: + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert + + // This is roughly the following managed code: + // ... + // tmp1 = Sse?.Insert(tmp1, opN, N); + // ... 
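// (Illustration, not part of this diff: for Vector128.Create(e0, ..., e7) with short elements
//  the loop above has already produced, in managed terms,
//      tmp1 = Vector128.CreateScalarUnsafe(e0);
//      tmp1 = Sse2.Insert(tmp1, e1, 1);  ...  tmp1 = Sse2.Insert(tmp1, e6, 6);
//  and the code below rewrites the Create node itself into the final Sse2.Insert(tmp1, e7, 7).)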
+ + opN = argList->Current(); + + idx = comp->gtNewIconNode(N, TYP_INT); + BlockRange().InsertAfter(opN, idx); + + node->gtOp1 = comp->gtNewArgList(tmp1, opN, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = insIntrinsic; + break; + } + + assert((baseType != TYP_SHORT) && (baseType != TYP_USHORT)); + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + GenTree* op[16]; + op[0] = tmp1; + + for (N = 1; N < argCnt; N++) + { + opN = argList->Current(); + + op[N] = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(opN, op[N]); + LowerNode(op[N]); + + argList = argList->Rest(); + } + assert(argList == nullptr); + + if ((baseType == TYP_BYTE) || (baseType == TYP_UBYTE)) + { + for (N = 0; N < argCnt; N += 4) + { + // We will be constructing the following parts: + // ... + // /--* opN T + // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opO T + // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opN simd16 + // +--* opO simd16 + // tmp1 = * HWINTRINSIC simd16 T UnpackLow + // /--* opP T + // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opQ T + // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opP simd16 + // +--* opQ simd16 + // tmp2 = * HWINTRINSIC simd16 T UnpackLow + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp3 = * HWINTRINSIC simd16 T UnpackLow + // ... + + // This is roughly the following managed code: + // ... + // tmp1 = Sse2.UnpackLow(opN, opO); + // tmp2 = Sse2.UnpackLow(opP, opQ); + // tmp3 = Sse2.UnpackLow(tmp1, tmp2); + // ... + + unsigned O = N + 1; + unsigned P = N + 2; + unsigned Q = N + 3; + + tmp1 = + comp->gtNewSimdHWIntrinsicNode(simdType, op[N], op[O], NI_SSE2_UnpackLow, TYP_UBYTE, simdSize); + BlockRange().InsertAfter(op[O], tmp1); + LowerNode(tmp1); + + tmp2 = + comp->gtNewSimdHWIntrinsicNode(simdType, op[P], op[Q], NI_SSE2_UnpackLow, TYP_UBYTE, simdSize); + BlockRange().InsertAfter(op[Q], tmp2); + LowerNode(tmp2); + + tmp3 = + comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_SSE2_UnpackLow, TYP_USHORT, simdSize); + BlockRange().InsertAfter(tmp2, tmp3); + LowerNode(tmp3); + + // This caches the result in index 0 through 3, depending on which + // loop iteration this is and allows the rest of the logic to be + // shared with the TYP_INT and TYP_UINT path. + + op[N / 4] = tmp3; + } + } + + // We will be constructing the following parts: + // ... + // /--* opN T + // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opO T + // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opN simd16 + // +--* opO simd16 + // tmp1 = * HWINTRINSIC simd16 T UnpackLow + // /--* opP T + // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opQ T + // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opP simd16 + // +--* opQ simd16 + // tmp2 = * HWINTRINSIC simd16 T UnpackLow + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T UnpackLow + + // This is roughly the following managed code: + // ... 
+ // tmp1 = Sse2.UnpackLow(opN, opO); + // tmp2 = Sse2.UnpackLow(opP, opQ); + // return Sse2.UnpackLow(tmp1, tmp2); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op[0], op[1], NI_SSE2_UnpackLow, TYP_UINT, simdSize); + BlockRange().InsertAfter(op[1], tmp1); + LowerNode(tmp1); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, op[2], op[3], NI_SSE2_UnpackLow, TYP_UINT, simdSize); + BlockRange().InsertAfter(op[3], tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE2_UnpackLow; + node->gtSIMDBaseType = TYP_ULONG; + break; + } + +#if defined(TARGET_AMD64) + case TYP_LONG: + case TYP_ULONG: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41_X64)) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int 1 + // /--* tmp1 simd16 + // +--* op2 T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert + + // This is roughly the following managed code: + // ... + // return Sse41.X64.Insert(tmp1, op2, 0x01); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(op2, idx); + + node->gtOp1 = comp->gtNewArgList(tmp1, op2, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_SSE41_X64_Insert; + break; + } + + // We will be constructing the following parts: + // ... + // /--* op2 T + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T UnpackLow + + // This is roughly the following managed code: + // ... + // var tmp2 = Vector128.CreateScalarUnsafe(op2); + // return Sse2.UnpackLow(tmp1, tmp2); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op2, tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE2_UnpackLow; + break; + } +#endif // TARGET_AMD64 + + case TYP_FLOAT: + { + unsigned N = 0; + GenTree* opN = nullptr; + + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + for (N = 1; N < argCnt - 1; N++) + { + // We will be constructing the following parts: + // ... + // + // /--* opN T + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // tmp1 = * HWINTRINSIC simd16 T Insert + // ... + + // This is roughly the following managed code: + // ... + // tmp2 = Vector128.CreateScalarUnsafe(opN); + // tmp1 = Sse41.Insert(tmp1, tmp2, N << 4); + // ... + + opN = argList->Current(); + + tmp2 = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(opN, tmp2); + LowerNode(tmp2); + + idx = comp->gtNewIconNode(N << 4, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + tmp1 = + comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, idx, NI_SSE41_Insert, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp1); + LowerNode(tmp1); + + argList = argList->Rest(); + } + + // We will be constructing the following parts: + // ... + // + // /--* opN T + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert + + // This is roughly the following managed code: + // ... 
+ // tmp2 = Vector128.CreateScalarUnsafe(opN); + // return Sse41.Insert(tmp1, tmp2, N << 4); + + opN = argList->Current(); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(opN, tmp2); + LowerNode(tmp2); + + idx = comp->gtNewIconNode((argCnt - 1) << 4, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + node->gtOp1 = comp->gtNewArgList(tmp1, tmp2, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_SSE41_Insert; + break; + } + + // We will be constructing the following parts: + // ... + // /--* opN T + // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opO T + // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opN simd16 + // +--* opO simd16 + // tmp1 = * HWINTRINSIC simd16 T UnpackLow + // /--* opP T + // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opQ T + // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opP simd16 + // +--* opQ simd16 + // tmp2 = * HWINTRINSIC simd16 T UnpackLow + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T MoveLowToHigh + + // This is roughly the following managed code: + // ... + // tmp1 = Sse.UnpackLow(opN, opO); + // tmp2 = Sse.UnpackLow(opP, opQ); + // return Sse.MoveLowToHigh(tmp1, tmp2); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE)); + + GenTree* op[4]; + op[0] = tmp1; + + for (N = 1; N < argCnt; N++) + { + opN = argList->Current(); + + op[N] = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(opN, op[N]); + LowerNode(op[N]); + + argList = argList->Rest(); + } + assert(argList == nullptr); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op[0], op[1], NI_SSE_UnpackLow, baseType, simdSize); + BlockRange().InsertAfter(op[1], tmp1); + LowerNode(tmp1); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, op[2], op[3], NI_SSE_UnpackLow, baseType, simdSize); + BlockRange().InsertAfter(op[3], tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE_MoveLowToHigh; + break; + } + + case TYP_DOUBLE: + { + // We will be constructing the following parts: + // ... + // /--* op2 T + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T MoveLowToHigh + + // This is roughly the following managed code: + // ... 
+ // var tmp2 = Vector128.CreateScalarUnsafe(op2); + // return Sse.MoveLowToHigh(tmp1, tmp2); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op2, tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE_MoveLowToHigh; + node->gtSIMDBaseType = TYP_FLOAT; + + break; + } + + default: + { + unreached(); + } + } +} #endif // FEATURE_HW_INTRINSICS //---------------------------------------------------------------------------------------------- diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index 5d55d00f11b8ae99c1a77ac9ca45d411f9c21170..db15743c6dcba4db42a20e347e0bb42baeb45394 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -259,31 +259,12 @@ public static Vector AsVector(this Vector128 value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m128i _mm_set1_epi8 /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] [Intrinsic] public static unsafe Vector128 Create(byte value) { - if (Avx2.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > - } - - if (Ssse3.IsSupported) - { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Ssse3.Shuffle(result, Vector128.Zero); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > - } - - if (Sse2.IsSupported) - { - // We first unpack as bytes to duplicate value into the lower 2 bytes, then we treat it as a ushort and unpack again to duplicate those - // bits into the lower 2 words, we can finally treat it as a uint and shuffle the lower dword to duplicate value across the entire result - - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.UnpackLow(result, result); // < v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.UnpackLow(result.AsUInt16(), result.AsUInt16()).AsByte(); // < v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Sse2.Shuffle(result.AsUInt32(), 0x00).AsByte(); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -318,23 +299,12 @@ static Vector128 SoftwareFallback(byte value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m128d _mm_set1_pd /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] [Intrinsic] public static unsafe Vector128 Create(double value) { - if (Sse3.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128 result = CreateScalarUnsafe(value); // < v, ? 
> - return Sse3.MoveAndDuplicate(result); // < v, v > - } - - if (Sse2.IsSupported) - { - // Treating the value as a set of singles and emitting MoveLowToHigh is more efficient than dealing with the elements directly as double - // However, we still need to check if Sse2 is supported since CreateScalarUnsafe needs it to for movsd, when value is not already in register - - Vector128 result = CreateScalarUnsafe(value); // < v, ? > - return Sse.MoveLowToHigh(result.AsSingle(), result.AsSingle()).AsDouble(); // < v, v > + return Create(value); } return SoftwareFallback(value); @@ -355,24 +325,12 @@ static Vector128 SoftwareFallback(double value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m128i _mm_set1_epi16 /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] [Intrinsic] public static unsafe Vector128 Create(short value) { - if (Avx2.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v, v, v, v, v > - } - - if (Sse2.IsSupported) - { - // We first unpack as ushort to duplicate value into the lower 2 words, then we can treat it as a uint and shuffle the lower dword to - // duplicate value across the entire result - - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.UnpackLow(result, result); // < v, v, ?, ?, ?, ?, ?, ? > - return Sse2.Shuffle(result.AsInt32(), 0x00).AsInt16(); // < v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -399,20 +357,12 @@ static Vector128 SoftwareFallback(short value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m128i _mm_set1_epi32 /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] [Intrinsic] public static unsafe Vector128 Create(int value) { - if (Avx2.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v > - } - - if (Sse2.IsSupported) - { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Sse2.Shuffle(result, 0x00); // < v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -435,22 +385,12 @@ static Vector128 SoftwareFallback(int value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m128i _mm_set1_epi64x /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] [Intrinsic] public static unsafe Vector128 Create(long value) { - if (Sse2.X64.IsSupported) + if (Sse2.X64.IsSupported || AdvSimd.IsSupported) { - if (Avx2.IsSupported) - { - Vector128 result = CreateScalarUnsafe(value); // < v, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v > - } - else - { - Vector128 result = CreateScalarUnsafe(value); // < v, ? > - return Sse2.UnpackLow(result, result); // < v, v > - } + return Create(value); } return SoftwareFallback(value); @@ -471,32 +411,13 @@ static Vector128 SoftwareFallback(long value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m128i _mm_set1_epi8 /// A new with all elements initialized to . 
- [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CLSCompliant(false)] [Intrinsic] + [CLSCompliant(false)] public static unsafe Vector128 Create(sbyte value) { - if (Avx2.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > - } - - if (Ssse3.IsSupported) - { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Ssse3.Shuffle(result, Vector128.Zero); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > - } - - if (Sse2.IsSupported) - { - // We first unpack as bytes to duplicate value into the lower 2 bytes, then we treat it as a ushort and unpack again to duplicate those - // bits into the lower 2 words, we can finally treat it as a uint and shuffle the lower dword to duplicate value across the entire result - - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.UnpackLow(result, result); // < v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.UnpackLow(result.AsInt16(), result.AsInt16()).AsSByte(); // < v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Sse2.Shuffle(result.AsInt32(), 0x00).AsSByte(); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -531,26 +452,12 @@ static Vector128 SoftwareFallback(sbyte value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m128 _mm_set1_ps /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] [Intrinsic] public static unsafe Vector128 Create(float value) { - if (Avx2.IsSupported) + if (Sse.IsSupported || AdvSimd.IsSupported) { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v > - } - - if (Avx.IsSupported) - { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Avx.Permute(result, 0x00); // < v, v, v, v > - } - - if (Sse.IsSupported) - { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Sse.Shuffle(result, result, 0x00); // < v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -573,25 +480,13 @@ static Vector128 SoftwareFallback(float value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m128i _mm_set1_epi16 /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CLSCompliant(false)] [Intrinsic] + [CLSCompliant(false)] public static unsafe Vector128 Create(ushort value) { - if (Avx2.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v, v, v, v, v > - } - - if (Sse2.IsSupported) - { - // We first unpack as ushort to duplicate value into the lower 2 words, then we can treat it as a uint and shuffle the lower dword to - // duplicate value across the entire result - - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.UnpackLow(result, result); // < v, v, ?, ?, ?, ?, ?, ? 
> - return Sse2.Shuffle(result.AsUInt32(), 0x00).AsUInt16(); // < v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -618,21 +513,13 @@ static Vector128 SoftwareFallback(ushort value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m128i _mm_set1_epi32 /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CLSCompliant(false)] [Intrinsic] + [CLSCompliant(false)] public static unsafe Vector128 Create(uint value) { - if (Avx2.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v > - } - - if (Sse2.IsSupported) - { - Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Sse2.Shuffle(result, 0x00); // < v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -655,23 +542,13 @@ static Vector128 SoftwareFallback(uint value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m128i _mm_set1_epi64x /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CLSCompliant(false)] [Intrinsic] + [CLSCompliant(false)] public static unsafe Vector128 Create(ulong value) { - if (Sse2.X64.IsSupported) + if (Sse2.X64.IsSupported || AdvSimd.IsSupported) { - if (Avx2.IsSupported) - { - Vector128 result = CreateScalarUnsafe(value); // < v, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v > - } - else - { - Vector128 result = CreateScalarUnsafe(value); // < v, ? > - return Sse2.UnpackLow(result, result); // < v, v > - } + return Create(value); } return SoftwareFallback(value); @@ -707,62 +584,15 @@ static Vector128 SoftwareFallback(ulong value) /// The value that element 15 will be initialized to. /// On x86, this method corresponds to __m128i _mm_setr_epi8 /// A new with each element initialized to corresponding specified value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector128 Create(byte e0, byte e1, byte e2, byte e3, byte e4, byte e5, byte e6, byte e7, byte e8, byte e9, byte e10, byte e11, byte e12, byte e13, byte e14, byte e15) { - if (Sse41.IsSupported) - { - Vector128 result = CreateScalarUnsafe(e0); // < 0, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e1, 1); // < 0, 1, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e2, 2); // < 0, 1, 2, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e3, 3); // < 0, 1, 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e4, 4); // < 0, 1, 2, 3, 4, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e5, 5); // < 0, 1, 2, 3, 4, 5, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e6, 6); // < 0, 1, 2, 3, 4, 5, 6, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e7, 7); // < 0, 1, 2, 3, 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e8, 8); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e9, 9); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e10, 10); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ??, ??, ??, ??, ?? 
> - result = Sse41.Insert(result, e11, 11); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ??, ??, ??, ?? > - result = Sse41.Insert(result, e12, 12); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ??, ??, ?? > - result = Sse41.Insert(result, e13, 13); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ??, ?? > - result = Sse41.Insert(result, e14, 14); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ?? > - return Sse41.Insert(result, e15, 15); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 > - } - +#if !TARGET_ARM && !TARGET_ARM64 if (Sse2.IsSupported) { - // We deal with the elements in order, unpacking the ordered pairs of bytes into vectors. We then treat those vectors as ushort and - // unpack them again, then again treating those results as uint, and a final time treating them as ulong. This efficiently gets all - // bytes ordered into the result. - - Vector128 lo16, hi16; - Vector128 lo32, hi32; - Vector128 lo64, hi64; - - lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)).AsUInt16(); // < 0, 1, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)).AsUInt16(); // < 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - lo32 = Sse2.UnpackLow(lo16, hi16).AsUInt32(); // < 0, 1, 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - - lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e4), CreateScalarUnsafe(e5)).AsUInt16(); // < 4, 5, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e6), CreateScalarUnsafe(e7)).AsUInt16(); // < 6, 7, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi32 = Sse2.UnpackLow(lo16, hi16).AsUInt32(); // < 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - - lo64 = Sse2.UnpackLow(lo32, hi32).AsUInt64(); // < 0, 1, 2, 3, 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ?? > - - lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e8), CreateScalarUnsafe(e9)).AsUInt16(); // < 8, 9, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e10), CreateScalarUnsafe(e11)).AsUInt16(); // < 10, 11, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - lo32 = Sse2.UnpackLow(lo16, hi16).AsUInt32(); // < 8, 9, 10, 11, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - - lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e12), CreateScalarUnsafe(e13)).AsUInt16(); // < 12, 13, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e14), CreateScalarUnsafe(e15)).AsUInt16(); // < 14, 15, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi32 = Sse2.UnpackLow(lo16, hi16).AsUInt32(); // < 12, 13, 14, 15, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - - hi64 = Sse2.UnpackLow(lo32, hi32).AsUInt64(); // < 8, 9, 10, 11, 12, 13, 14, 15, ??, ??, ??, ??, ??, ??, ??, ?? 
> - - return Sse2.UnpackLow(lo64, hi64).AsByte(); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 > + return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); @@ -782,6 +612,7 @@ public static unsafe Vector128 Create(byte e0, byte e1, byte e2, byte e3, result = AdvSimd.Insert(result, 14, e14); return AdvSimd.Insert(result, 15, e15); } +#endif return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); @@ -816,22 +647,21 @@ static Vector128 SoftwareFallback(byte e0, byte e1, byte e2, byte e3, byte /// The value that element 1 will be initialized to. /// On x86, this method corresponds to __m128d _mm_setr_pd /// A new with each element initialized to corresponding specified value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector128 Create(double e0, double e1) { +#if !TARGET_ARM && !TARGET_ARM64 if (Sse2.IsSupported) { - // Treating the value as a set of singles and emitting MoveLowToHigh is more efficient than dealing with the elements directly as double - // However, we still need to check if Sse2 is supported since CreateScalarUnsafe needs it to for movsd, when value is not already in register - - return Sse.MoveLowToHigh(CreateScalarUnsafe(e0).AsSingle(), CreateScalarUnsafe(e1).AsSingle()).AsDouble(); + return Create(e0, e1); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); return AdvSimd.Insert(result, 1, e1); } +#endif return SoftwareFallback(e0, e1); @@ -858,21 +688,15 @@ static Vector128 SoftwareFallback(double e0, double e1) /// The value that element 7 will be initialized to. /// On x86, this method corresponds to __m128i _mm_setr_epi16 /// A new with each element initialized to corresponding specified value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector128 Create(short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7) { +#if !TARGET_ARM && !TARGET_ARM64 if (Sse2.IsSupported) { - Vector128 result = CreateScalarUnsafe(e0); // < 0, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.Insert(result, e1, 1); // < 0, 1, ?, ?, ?, ?, ?, ? > - result = Sse2.Insert(result, e2, 2); // < 0, 1, 2, ?, ?, ?, ?, ? > - result = Sse2.Insert(result, e3, 3); // < 0, 1, 2, 3, ?, ?, ?, ? > - result = Sse2.Insert(result, e4, 4); // < 0, 1, 2, 3, 4, ?, ?, ? > - result = Sse2.Insert(result, e5, 5); // < 0, 1, 2, 3, 4, 5, ?, ? > - result = Sse2.Insert(result, e6, 6); // < 0, 1, 2, 3, 4, 5, 6, ? > - return Sse2.Insert(result, e7, 7); // < 0, 1, 2, 3, 4, 5, 6, 7 > + return Create(e0, e1, e2, e3, e4, e5, e6, e7); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); @@ -884,6 +708,7 @@ public static unsafe Vector128 Create(short e0, short e1, short e2, short result = AdvSimd.Insert(result, 6, e6); return AdvSimd.Insert(result, 7, e7); } +#endif return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7); @@ -912,28 +737,15 @@ static Vector128 SoftwareFallback(short e0, short e1, short e2, short e3, /// The value that element 3 will be initialized to. /// On x86, this method corresponds to __m128i _mm_setr_epi32 /// A new with each element initialized to corresponding specified value. 
- [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector128 Create(int e0, int e1, int e2, int e3) { - if (Sse41.IsSupported) - { - Vector128 result = CreateScalarUnsafe(e0); // < 0, ?, ?, ? > - result = Sse41.Insert(result, e1, 1); // < 0, 1, ?, ? > - result = Sse41.Insert(result, e2, 2); // < 0, 1, 2, ? > - return Sse41.Insert(result, e3, 3); // < 0, 1, 2, 3 > - } - +#if !TARGET_ARM && !TARGET_ARM64 if (Sse2.IsSupported) { - // We deal with the elements in order, unpacking the ordered pairs of int into vectors. We then treat those vectors as ulong and - // unpack them again. This efficiently gets all ints ordered into the result. - - Vector128 lo64, hi64; - lo64 = Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)).AsInt64(); // < 0, 1, ?, ? > - hi64 = Sse2.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)).AsInt64(); // < 2, 3, ?, ? > - return Sse2.UnpackLow(lo64, hi64).AsInt32(); // < 0, 1, 2, 3 > + return Create(e0, e1, e2, e3); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); @@ -941,6 +753,7 @@ public static unsafe Vector128 Create(int e0, int e1, int e2, int e3) result = AdvSimd.Insert(result, 2, e2); return AdvSimd.Insert(result, 3, e3); } +#endif return SoftwareFallback(e0, e1, e2, e3); @@ -963,25 +776,21 @@ static Vector128 SoftwareFallback(int e0, int e1, int e2, int e3) /// The value that element 1 will be initialized to. /// On x86, this method corresponds to __m128i _mm_setr_epi64x /// A new with each element initialized to corresponding specified value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector128 Create(long e0, long e1) { - if (Sse41.X64.IsSupported) - { - Vector128 result = CreateScalarUnsafe(e0); // < 0, ? > - return Sse41.X64.Insert(result, e1, 1); // < 0, 1 > - } - +#if !TARGET_ARM && !TARGET_ARM64 if (Sse2.X64.IsSupported) { - return Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)); // < 0, 1 > + return Create(e0, e1); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); return AdvSimd.Insert(result, 1, e1); } +#endif return SoftwareFallback(e0, e1); @@ -1016,63 +825,16 @@ static Vector128 SoftwareFallback(long e0, long e1) /// The value that element 15 will be initialized to. /// On x86, this method corresponds to __m128i _mm_setr_epi8 /// A new with each element initialized to corresponding specified value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector128 Create(sbyte e0, sbyte e1, sbyte e2, sbyte e3, sbyte e4, sbyte e5, sbyte e6, sbyte e7, sbyte e8, sbyte e9, sbyte e10, sbyte e11, sbyte e12, sbyte e13, sbyte e14, sbyte e15) { - if (Sse41.IsSupported) - { - Vector128 result = CreateScalarUnsafe(e0); // < 0, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e1, 1); // < 0, 1, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e2, 2); // < 0, 1, 2, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e3, 3); // < 0, 1, 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e4, 4); // < 0, 1, 2, 3, 4, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e5, 5); // < 0, 1, 2, 3, 4, 5, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e6, 6); // < 0, 1, 2, 3, 4, 5, 6, ??, ??, ??, ??, ??, ??, ??, ??, ?? 
> - result = Sse41.Insert(result, e7, 7); // < 0, 1, 2, 3, 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e8, 8); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e9, 9); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e10, 10); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e11, 11); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ??, ??, ??, ?? > - result = Sse41.Insert(result, e12, 12); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ??, ??, ?? > - result = Sse41.Insert(result, e13, 13); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ??, ?? > - result = Sse41.Insert(result, e14, 14); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ?? > - return Sse41.Insert(result, e15, 15); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 > - } - +#if !TARGET_ARM && !TARGET_ARM64 if (Sse2.IsSupported) { - // We deal with the elements in order, unpacking the ordered pairs of bytes into vectors. We then treat those vectors as ushort and - // unpack them again, then again treating those results as uint, and a final time treating them as ulong. This efficiently gets all - // bytes ordered into the result. - - Vector128 lo16, hi16; - Vector128 lo32, hi32; - Vector128 lo64, hi64; - - lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)).AsInt16(); // < 0, 1, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)).AsInt16(); // < 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - lo32 = Sse2.UnpackLow(lo16, hi16).AsInt32(); // < 0, 1, 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - - lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e4), CreateScalarUnsafe(e5)).AsInt16(); // < 4, 5, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e6), CreateScalarUnsafe(e7)).AsInt16(); // < 6, 7, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi32 = Sse2.UnpackLow(lo16, hi16).AsInt32(); // < 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - - lo64 = Sse2.UnpackLow(lo32, hi32).AsInt64(); // < 0, 1, 2, 3, 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ?? > - - lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e8), CreateScalarUnsafe(e9)).AsInt16(); // < 8, 9, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e10), CreateScalarUnsafe(e11)).AsInt16(); // < 10, 11, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - lo32 = Sse2.UnpackLow(lo16, hi16).AsInt32(); // < 8, 9, 10, 11, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - - lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e12), CreateScalarUnsafe(e13)).AsInt16(); // < 12, 13, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e14), CreateScalarUnsafe(e15)).AsInt16(); // < 14, 15, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi32 = Sse2.UnpackLow(lo16, hi16).AsInt32(); // < 12, 13, 14, 15, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - - hi64 = Sse2.UnpackLow(lo32, hi32).AsInt64(); // < 8, 9, 10, 11, 12, 13, 14, 15, ??, ??, ??, ??, ??, ??, ??, ?? 
> - - return Sse2.UnpackLow(lo64, hi64).AsSByte(); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 > + return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); @@ -1092,6 +854,7 @@ public static unsafe Vector128 Create(sbyte e0, sbyte e1, sbyte e2, sbyte result = AdvSimd.Insert(result, 14, e14); return AdvSimd.Insert(result, 15, e15); } +#endif return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); @@ -1128,25 +891,15 @@ static Vector128 SoftwareFallback(sbyte e0, sbyte e1, sbyte e2, sbyte e3, /// The value that element 3 will be initialized to. /// On x86, this method corresponds to __m128 _mm_setr_ps /// A new with each element initialized to corresponding specified value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector128 Create(float e0, float e1, float e2, float e3) { - if (Sse41.IsSupported) - { - Vector128 result = CreateScalarUnsafe(e0); // < 0, ?, ?, ? > - result = Sse41.Insert(result, CreateScalarUnsafe(e1), 0x10); // < 0, 1, ?, ? > - result = Sse41.Insert(result, CreateScalarUnsafe(e2), 0x20); // < 0, 1, 2, ? > - return Sse41.Insert(result, CreateScalarUnsafe(e3), 0x30); // < 0, 1, 2, 3 > - } - +#if !TARGET_ARM && !TARGET_ARM64 if (Sse.IsSupported) { - Vector128 lo64, hi64; - lo64 = Sse.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)); // < 0, 1, ?, ? > - hi64 = Sse.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)); // < 2, 3, ?, ? > - return Sse.MoveLowToHigh(lo64, hi64); // < 0, 1, 2, 3 > + return Create(e0, e1, e2, e3); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); @@ -1154,6 +907,7 @@ public static unsafe Vector128 Create(float e0, float e1, float e2, float result = AdvSimd.Insert(result, 2, e2); return AdvSimd.Insert(result, 3, e3); } +#endif return SoftwareFallback(e0, e1, e2, e3); @@ -1182,22 +936,16 @@ static Vector128 SoftwareFallback(float e0, float e1, float e2, float e3) /// The value that element 7 will be initialized to. /// On x86, this method corresponds to __m128i _mm_setr_epi16 /// A new with each element initialized to corresponding specified value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector128 Create(ushort e0, ushort e1, ushort e2, ushort e3, ushort e4, ushort e5, ushort e6, ushort e7) { +#if !TARGET_ARM && !TARGET_ARM64 if (Sse2.IsSupported) { - Vector128 result = CreateScalarUnsafe(e0); // < 0, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.Insert(result, e1, 1); // < 0, 1, ?, ?, ?, ?, ?, ? > - result = Sse2.Insert(result, e2, 2); // < 0, 1, 2, ?, ?, ?, ?, ? > - result = Sse2.Insert(result, e3, 3); // < 0, 1, 2, 3, ?, ?, ?, ? > - result = Sse2.Insert(result, e4, 4); // < 0, 1, 2, 3, 4, ?, ?, ? > - result = Sse2.Insert(result, e5, 5); // < 0, 1, 2, 3, 4, 5, ?, ? > - result = Sse2.Insert(result, e6, 6); // < 0, 1, 2, 3, 4, 5, 6, ? 
> - return Sse2.Insert(result, e7, 7); // < 0, 1, 2, 3, 4, 5, 6, 7 > + return Create(e0, e1, e2, e3, e4, e5, e6, e7); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); @@ -1209,6 +957,7 @@ public static unsafe Vector128 Create(ushort e0, ushort e1, ushort e2, u result = AdvSimd.Insert(result, 6, e6); return AdvSimd.Insert(result, 7, e7); } +#endif return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7); @@ -1237,29 +986,16 @@ static Vector128 SoftwareFallback(ushort e0, ushort e1, ushort e2, ushor /// The value that element 3 will be initialized to. /// On x86, this method corresponds to __m128i _mm_setr_epi32 /// A new with each element initialized to corresponding specified value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector128 Create(uint e0, uint e1, uint e2, uint e3) { - if (Sse41.IsSupported) - { - Vector128 result = CreateScalarUnsafe(e0); // < 0, ?, ?, ? > - result = Sse41.Insert(result, e1, 1); // < 0, 1, ?, ? > - result = Sse41.Insert(result, e2, 2); // < 0, 1, 2, ? > - return Sse41.Insert(result, e3, 3); // < 0, 1, 2, 3 > - } - +#if !TARGET_ARM && !TARGET_ARM64 if (Sse2.IsSupported) { - // We deal with the elements in order, unpacking the ordered pairs of int into vectors. We then treat those vectors as ulong and - // unpack them again. This efficiently gets all ints ordered into the result. - - Vector128 lo64, hi64; - lo64 = Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)).AsUInt64(); // < 0, 1, ?, ? > - hi64 = Sse2.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)).AsUInt64(); // < 2, 3, ?, ? > - return Sse2.UnpackLow(lo64, hi64).AsUInt32(); // < 0, 1, 2, 3 > + return Create(e0, e1, e2, e3); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); @@ -1267,6 +1003,7 @@ public static unsafe Vector128 Create(uint e0, uint e1, uint e2, uint e3) result = AdvSimd.Insert(result, 2, e2); return AdvSimd.Insert(result, 3, e3); } +#endif return SoftwareFallback(e0, e1, e2, e3); @@ -1289,26 +1026,22 @@ static Vector128 SoftwareFallback(uint e0, uint e1, uint e2, uint e3) /// The value that element 1 will be initialized to. /// On x86, this method corresponds to __m128i _mm_setr_epi64x /// A new with each element initialized to corresponding specified value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector128 Create(ulong e0, ulong e1) { - if (Sse41.X64.IsSupported) - { - Vector128 result = CreateScalarUnsafe(e0); // < 0, ? > - return Sse41.X64.Insert(result, e1, 1); // < 0, 1 > - } - +#if !TARGET_ARM && !TARGET_ARM64 if (Sse2.X64.IsSupported) { - return Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)); // < 0, 1 > + return Create(e0, e1); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); return AdvSimd.Insert(result, 1, e1); } +#endif return SoftwareFallback(e0, e1); diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index 2a123ab52099c33faf83e9e18015105ee71e5416..ae4e95608b033624fea69f77e2edc603dd9b5b65 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -208,19 +208,12 @@ public static Vector AsVector(this Vector256 value) /// The value that all elements will be initialized to. 
/// On x86, this method corresponds to __m256i _mm256_set1_epi8 /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector256 Create(byte value) { - if (Avx2.IsSupported) - { - Vector128 result = Vector128.CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector256(result); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > - } - if (Avx.IsSupported) { - Vector128 result = Vector128.Create(value); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -271,19 +264,12 @@ static Vector256 SoftwareFallback(byte value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m256d _mm256_set1_pd /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector256 Create(double value) { - if (Avx2.IsSupported) - { - Vector128 result = Vector128.CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Avx2.BroadcastScalarToVector256(result); // < v, v, v, v > - } - if (Avx.IsSupported) { - Vector128 result = Vector128.Create(value); // < v, v, ?, ? > - return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1); // < v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -306,19 +292,12 @@ static Vector256 SoftwareFallback(double value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m256i _mm256_set1_epi16 /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector256 Create(short value) { - if (Avx2.IsSupported) - { - Vector128 result = Vector128.CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector256(result); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > - } - if (Avx.IsSupported) { - Vector128 result = Vector128.Create(value); // < v, v, v, v, v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ? > - return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -353,19 +332,12 @@ static Vector256 SoftwareFallback(short value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m256i _mm256_set1_epi32 /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector256 Create(int value) { - if (Avx2.IsSupported) - { - Vector128 result = Vector128.CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector256(result); // < v, v, v, v, v, v, v, v > - } - if (Avx.IsSupported) { - Vector128 result = Vector128.Create(value); // < v, v, v, v, ?, ?, ?, ? 
> - return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1); // < v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -392,21 +364,12 @@ static Vector256 SoftwareFallback(int value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m256i _mm256_set1_epi64x /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector256 Create(long value) { - if (Sse2.X64.IsSupported) + if (Sse2.X64.IsSupported && Avx.IsSupported) { - if (Avx2.IsSupported) - { - Vector128 result = Vector128.CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Avx2.BroadcastScalarToVector256(result); // < v, v, v, v > - } - else if (Avx.IsSupported) - { - Vector128 result = Vector128.Create(value); // < v, v, ?, ? > - return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1); // < v, v, v, v > - } + return Create(value); } return SoftwareFallback(value); @@ -429,20 +392,13 @@ static Vector256 SoftwareFallback(long value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m256i _mm256_set1_epi8 /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector256 Create(sbyte value) { - if (Avx2.IsSupported) - { - Vector128 result = Vector128.CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector256(result); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > - } - if (Avx.IsSupported) { - Vector128 result = Vector128.Create(value); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -493,19 +449,12 @@ static Vector256 SoftwareFallback(sbyte value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m256 _mm256_set1_ps /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector256 Create(float value) { - if (Avx2.IsSupported) - { - Vector128 result = Vector128.CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector256(result); // < v, v, v, v, v, v, v, v > - } - if (Avx.IsSupported) { - Vector128 result = Vector128.Create(value); // < v, v, v, v, ?, ?, ?, ? > - return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1); // < v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -532,20 +481,13 @@ static Vector256 SoftwareFallback(float value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m256i _mm256_set1_epi16 /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector256 Create(ushort value) { - if (Avx2.IsSupported) - { - Vector128 result = Vector128.CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? 
> - return Avx2.BroadcastScalarToVector256(result); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > - } - if (Avx.IsSupported) { - Vector128 result = Vector128.Create(value); // < v, v, v, v, v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ? > - return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -580,20 +522,13 @@ static Vector256 SoftwareFallback(ushort value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m256i _mm256_set1_epi32 /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector256 Create(uint value) { - if (Avx2.IsSupported) - { - Vector128 result = Vector128.CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector256(result); // < v, v, v, v, v, v, v, v > - } - if (Avx.IsSupported) { - Vector128 result = Vector128.Create(value); // < v, v, v, v, ?, ?, ?, ? > - return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1); // < v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -620,22 +555,13 @@ static Vector256 SoftwareFallback(uint value) /// The value that all elements will be initialized to. /// On x86, this method corresponds to __m256i _mm256_set1_epi64x /// A new with all elements initialized to . - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector256 Create(ulong value) { - if (Sse2.X64.IsSupported) + if (Sse2.X64.IsSupported && Avx.IsSupported) { - if (Avx2.IsSupported) - { - Vector128 result = Vector128.CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Avx2.BroadcastScalarToVector256(result); // < v, v, v, v > - } - else if (Avx.IsSupported) - { - Vector128 result = Vector128.Create(value); // < v, v, ?, ? > - return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1); // < v, v, v, v > - } + return Create(value); } return SoftwareFallback(value); @@ -689,14 +615,12 @@ static Vector256 SoftwareFallback(ulong value) /// The value that element 31 will be initialized to. /// On x86, this method corresponds to __m256i _mm256_setr_epi8 /// A new with each element initialized to corresponding specified value. 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<byte> Create(byte e0, byte e1, byte e2, byte e3, byte e4, byte e5, byte e6, byte e7, byte e8, byte e9, byte e10, byte e11, byte e12, byte e13, byte e14, byte e15, byte e16, byte e17, byte e18, byte e19, byte e20, byte e21, byte e22, byte e23, byte e24, byte e25, byte e26, byte e27, byte e28, byte e29, byte e30, byte e31)
         {
             if (Avx.IsSupported)
             {
-                Vector128<byte> lo128 = Vector128.Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
-                Vector128<byte> hi128 = Vector128.Create(e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31);
             }

             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31);
@@ -750,14 +674,12 @@ static Vector256 SoftwareFallback(byte e0, byte e1, byte e2, byte e3, byte
         /// The value that element 3 will be initialized to.
         /// On x86, this method corresponds to __m256d _mm256_setr_pd
         /// A new with each element initialized to corresponding specified value.
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<double> Create(double e0, double e1, double e2, double e3)
         {
             if (Avx.IsSupported)
             {
-                Vector128<double> lo128 = Vector128.Create(e0, e1);
-                Vector128<double> hi128 = Vector128.Create(e2, e3);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3);
             }

             return SoftwareFallback(e0, e1, e2, e3);
@@ -795,14 +717,12 @@ static Vector256 SoftwareFallback(double e0, double e1, double e2, doubl
         /// The value that element 15 will be initialized to.
         /// On x86, this method corresponds to __m256i _mm256_setr_epi16
         /// A new with each element initialized to corresponding specified value.
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<short> Create(short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7, short e8, short e9, short e10, short e11, short e12, short e13, short e14, short e15)
         {
             if (Avx.IsSupported)
             {
-                Vector128<short> lo128 = Vector128.Create(e0, e1, e2, e3, e4, e5, e6, e7);
-                Vector128<short> hi128 = Vector128.Create(e8, e9, e10, e11, e12, e13, e14, e15);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
             }

             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
@@ -844,14 +764,12 @@ static Vector256 SoftwareFallback(short e0, short e1, short e2, short e3,
         /// The value that element 7 will be initialized to.
         /// On x86, this method corresponds to __m256i _mm256_setr_epi32
         /// A new with each element initialized to corresponding specified value.
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<int> Create(int e0, int e1, int e2, int e3, int e4, int e5, int e6, int e7)
         {
             if (Avx.IsSupported)
             {
-                Vector128<int> lo128 = Vector128.Create(e0, e1, e2, e3);
-                Vector128<int> hi128 = Vector128.Create(e4, e5, e6, e7);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7);
             }

             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7);
@@ -881,14 +799,12 @@ static Vector256 SoftwareFallback(int e0, int e1, int e2, int e3, int e4, i
         /// The value that element 3 will be initialized to.
         /// On x86, this method corresponds to __m256i _mm256_setr_epi64x
         /// A new with each element initialized to corresponding specified value.
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<long> Create(long e0, long e1, long e2, long e3)
         {
             if (Sse2.X64.IsSupported && Avx.IsSupported)
             {
-                Vector128<long> lo128 = Vector128.Create(e0, e1);
-                Vector128<long> hi128 = Vector128.Create(e2, e3);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3);
             }

             return SoftwareFallback(e0, e1, e2, e3);
@@ -942,15 +858,13 @@ static Vector256 SoftwareFallback(long e0, long e1, long e2, long e3)
         /// The value that element 31 will be initialized to.
         /// On x86, this method corresponds to __m256i _mm256_setr_epi8
         /// A new with each element initialized to corresponding specified value.
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector256<sbyte> Create(sbyte e0, sbyte e1, sbyte e2, sbyte e3, sbyte e4, sbyte e5, sbyte e6, sbyte e7, sbyte e8, sbyte e9, sbyte e10, sbyte e11, sbyte e12, sbyte e13, sbyte e14, sbyte e15, sbyte e16, sbyte e17, sbyte e18, sbyte e19, sbyte e20, sbyte e21, sbyte e22, sbyte e23, sbyte e24, sbyte e25, sbyte e26, sbyte e27, sbyte e28, sbyte e29, sbyte e30, sbyte e31)
         {
             if (Avx.IsSupported)
             {
-                Vector128<sbyte> lo128 = Vector128.Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
-                Vector128<sbyte> hi128 = Vector128.Create(e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31);
             }

             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31);
@@ -1008,14 +922,12 @@ static Vector256 SoftwareFallback(sbyte e0, sbyte e1, sbyte e2, sbyte e3,
         /// The value that element 7 will be initialized to.
         /// On x86, this method corresponds to __m256 _mm256_setr_ps
         /// A new with each element initialized to corresponding specified value.
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<float> Create(float e0, float e1, float e2, float e3, float e4, float e5, float e6, float e7)
         {
             if (Avx.IsSupported)
             {
-                Vector128<float> lo128 = Vector128.Create(e0, e1, e2, e3);
-                Vector128<float> hi128 = Vector128.Create(e4, e5, e6, e7);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7);
             }

             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7);
@@ -1057,15 +969,13 @@ static Vector256 SoftwareFallback(float e0, float e1, float e2, float e3,
         /// The value that element 15 will be initialized to.
         /// On x86, this method corresponds to __m256i _mm256_setr_epi16
         /// A new with each element initialized to corresponding specified value.
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector256<ushort> Create(ushort e0, ushort e1, ushort e2, ushort e3, ushort e4, ushort e5, ushort e6, ushort e7, ushort e8, ushort e9, ushort e10, ushort e11, ushort e12, ushort e13, ushort e14, ushort e15)
         {
             if (Avx.IsSupported)
             {
-                Vector128<ushort> lo128 = Vector128.Create(e0, e1, e2, e3, e4, e5, e6, e7);
-                Vector128<ushort> hi128 = Vector128.Create(e8, e9, e10, e11, e12, e13, e14, e15);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
             }

             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
@@ -1107,15 +1017,13 @@ static Vector256 SoftwareFallback(ushort e0, ushort e1, ushort e2, ushor
         /// The value that element 7 will be initialized to.
         /// On x86, this method corresponds to __m256i _mm256_setr_epi32
         /// A new with each element initialized to corresponding specified value.
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector256<uint> Create(uint e0, uint e1, uint e2, uint e3, uint e4, uint e5, uint e6, uint e7)
         {
             if (Avx.IsSupported)
             {
-                Vector128<uint> lo128 = Vector128.Create(e0, e1, e2, e3);
-                Vector128<uint> hi128 = Vector128.Create(e4, e5, e6, e7);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7);
             }

             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7);
@@ -1145,15 +1053,13 @@ static Vector256 SoftwareFallback(uint e0, uint e1, uint e2, uint e3, uint
         /// The value that element 3 will be initialized to.
         /// On x86, this method corresponds to __m256i _mm256_setr_epi64x
         /// A new with each element initialized to corresponding specified value.
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector256<ulong> Create(ulong e0, ulong e1, ulong e2, ulong e3)
         {
             if (Sse2.X64.IsSupported && Avx.IsSupported)
             {
-                Vector128<ulong> lo128 = Vector128.Create(e0, e1);
-                Vector128<ulong> hi128 = Vector128.Create(e2, e3);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3);
             }

             return SoftwareFallback(e0, e1, e2, e3);
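A minimal usage sketch of the reworked Create overloads (illustrative only: the CreateSamples class below is hypothetical and not part of this change; the Vector128/Vector256 APIs shown are the existing public surface). The intent is that the result is identical whether the JIT expands the now-[Intrinsic] call or the guarded fast path / SoftwareFallback executes:

using System.Runtime.Intrinsics;

internal static class CreateSamples // hypothetical helper, for illustration only
{
    internal static void Demo()
    {
        // Splat: every byte lane holds 0x2A. When the intrinsic is expanded this can become a
        // single broadcast/duplicate instruction; otherwise SoftwareFallback builds the same vector.
        Vector128<byte> splat = Vector128.Create((byte)0x2A);

        // Element-wise construction; element 0 is 1.0f and element 3 is 4.0f, matching the
        // _mm_setr_ps ordering called out in the XML docs.
        Vector128<float> lanes = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);

        // The Vector256 overloads follow the same pattern; the guarded path is only taken when
        // Avx.IsSupported (plus Sse2.X64.IsSupported for 64-bit element types).
        Vector256<int> wide = Vector256.Create(0, 1, 2, 3, 4, 5, 6, 7);
    }
}

The seemingly recursive return Create(...) calls inside the IsSupported guards appear to follow the usual CoreLib pattern for [Intrinsic] methods: if the method body is ever compiled (for example, when the method is invoked indirectly), the JIT expands the nested call as the hardware intrinsic, and the surrounding IsSupported check ensures that path is only reached on capable hardware; in every other case SoftwareFallback keeps the observable behavior unchanged.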