6987135: Performance regression on Intel platform with 32-bits edition between 6u13 and 6u14.

Summary: Use hardware DIV instruction for long division by constant when it is faster than code with multiply. Reviewed-by: never

6987135: Performance regression on Intel platform with 32-bits edition between 6u13 and 6u14.
Summary: Use hardware DIV instruction for long division by constant when it is faster than code with multiply. Reviewed-by: never
781a22bc · kvn · 764ce4b7 · 781a22bc · 781a22bc · 781a22bc
10 changed file
--- a/src/cpu/sparc/vm/sparc.ad
+++ b/src/cpu/sparc/vm/sparc.ad
@@ -1843,6 +1843,12 @@ bool Matcher::is_spillable_arg( int reg ) {
  return can_be_java_arg(reg);
 }

+bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
+  // Use hardware SDIVX instruction when it is
+  // faster than a code which use multiply.
+  return VM_Version::has_fast_idiv();
+}
+
 // Register for DIVI projection of divmodI
 RegMask Matcher::divI_proj_mask() {
  ShouldNotReachHere();

--- a/src/cpu/sparc/vm/vm_version_sparc.cpp
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp
 /*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -80,7 +80,8 @@ void VM_Version::initialize() {
      FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
    }
    if (is_niagara1_plus()) {
-      if (AllocatePrefetchStyle > 0 && FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
+      if (has_blk_init() && AllocatePrefetchStyle > 0 &&
+          FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
        // Use BIS instruction for allocation prefetch.
        FLAG_SET_DEFAULT(AllocatePrefetchStyle, 3);
        if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
@@ -118,16 +119,18 @@ void VM_Version::initialize() {
 #endif

  char buf[512];
-  jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
               (has_v8() ? ", has_v8" : ""),
               (has_v9() ? ", has_v9" : ""),
               (has_hardware_popc() ? ", popc" : ""),
               (has_vis1() ? ", has_vis1" : ""),
               (has_vis2() ? ", has_vis2" : ""),
+               (has_blk_init() ? ", has_blk_init" : ""),
               (is_ultra3() ? ", is_ultra3" : ""),
               (is_sun4v() ? ", is_sun4v" : ""),
               (is_niagara1() ? ", is_niagara1" : ""),
               (is_niagara1_plus() ? ", is_niagara1_plus" : ""),
+               (is_sparc64() ? ", is_sparc64" : ""),
               (!has_hardware_mul32() ? ", no-mul32" : ""),
               (!has_hardware_div32() ? ", no-div32" : ""),
               (!has_hardware_fsmuld() ? ", no-fsmuld" : ""));

--- a/src/cpu/sparc/vm/vm_version_sparc.hpp
+++ b/src/cpu/sparc/vm/vm_version_sparc.hpp
 /*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -33,7 +33,9 @@ protected:
    v9_instructions    = 5,
    vis1_instructions  = 6,
    vis2_instructions  = 7,
-    sun4v_instructions = 8
+    sun4v_instructions = 8,
+    blk_init_instructions = 9,
+    fmaf_instructions  = 10
  };

  enum Feature_Flag_Set {
@@ -49,6 +51,8 @@ protected:
    vis1_instructions_m = 1 << vis1_instructions,
    vis2_instructions_m = 1 << vis2_instructions,
    sun4v_m             = 1 << sun4v_instructions,
+    blk_init_instructions_m = 1 << blk_init_instructions,
+    fmaf_instructions_m = 1 << fmaf_instructions,

    generic_v8_m        = v8_instructions_m | hardware_mul32_m | hardware_div32_m | hardware_fsmuld_m,
    generic_v9_m        = generic_v8_m | v9_instructions_m,
@@ -67,6 +71,7 @@ protected:
  static int  platform_features(int features);

  static bool is_niagara1(int features) { return (features & sun4v_m) != 0; }
+  static bool  is_sparc64(int features) { return (features & fmaf_instructions_m) != 0; }

  static int maximum_niagara1_processor_count() { return 32; }
  // Returns true if the platform is in the niagara line and
@@ -86,6 +91,7 @@ public:
  static bool has_hardware_popc()       { return (_features & hardware_popc_m) != 0; }
  static bool has_vis1()                { return (_features & vis1_instructions_m) != 0; }
  static bool has_vis2()                { return (_features & vis2_instructions_m) != 0; }
+  static bool has_blk_init()            { return (_features & blk_init_instructions_m) != 0; }

  static bool supports_compare_and_exchange()
                                        { return has_v9(); }
@@ -93,8 +99,10 @@ public:
  static bool is_ultra3()               { return (_features & ultra3_m) == ultra3_m; }
  static bool is_sun4v()                { return (_features & sun4v_m) != 0; }
  static bool is_niagara1()             { return is_niagara1(_features); }
+  static bool is_sparc64()              { return is_sparc64(_features); }

  static bool has_fast_fxtof()          { return has_v9() && !is_ultra3(); }
+  static bool has_fast_idiv()           { return is_niagara1_plus() || is_sparc64(); }

  static const char* cpu_features()     { return _features_str; }


--- a/src/cpu/x86/vm/assembler_x86.cpp
+++ b/src/cpu/x86/vm/assembler_x86.cpp
@@ -1288,7 +1288,7 @@ void Assembler::imull(Register dst, Register src, int value) {
  if (is8bit(value)) {
    emit_byte(0x6B);
    emit_byte(0xC0 | encode);
-    emit_byte(value);
+    emit_byte(value & 0xFF);
  } else {
    emit_byte(0x69);
    emit_byte(0xC0 | encode);
@@ -3903,7 +3903,7 @@ void Assembler::imulq(Register dst, Register src, int value) {
  if (is8bit(value)) {
    emit_byte(0x6B);
    emit_byte(0xC0 | encode);
-    emit_byte(value);
+    emit_byte(value & 0xFF);
  } else {
    emit_byte(0x69);
    emit_byte(0xC0 | encode);

--- a/src/cpu/x86/vm/vm_version_x86.hpp
+++ b/src/cpu/x86/vm/vm_version_x86.hpp
@@ -446,6 +446,10 @@ public:
  static bool supports_lzcnt()    { return (_cpuFeatures & CPU_LZCNT) != 0; }
  static bool supports_sse4a()    { return (_cpuFeatures & CPU_SSE4A) != 0; }

+  // Intel Core and newer cpus have fast IDIV instruction (excluding Atom).
+  static bool has_fast_idiv()     { return is_intel() && cpu_family() == 6 &&
+                                           supports_sse3() && _model != 0x1C; }
+
  static bool supports_compare_and_exchange() { return true; }

  static const char* cpu_features()           { return _features_str; }

--- a/src/cpu/x86/vm/x86_32.ad
+++ b/src/cpu/x86/vm/x86_32.ad
@@ -1508,6 +1508,16 @@ bool Matcher::is_spillable_arg( int reg ) {
  return can_be_java_arg(reg);
 }

+bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
+  // Use hardware integer DIV instruction when
+  // it is faster than a code which use multiply.
+  // Only when constant divisor fits into 32 bit
+  // (min_jint is excluded to get only correct
+  // positive 32 bit values from negative).
+  return VM_Version::has_fast_idiv() &&
+         (divisor == (int)divisor && divisor != min_jint);
+}
+
 // Register for DIVI projection of divmodI
 RegMask Matcher::divI_proj_mask() {
  return EAX_REG_mask;
@@ -1546,6 +1556,9 @@ bool is_operand_hi32_zero(Node* n) {
      return true;
    }
  }
+  if (opc == Op_ConL && (n->get_long() & 0xFFFFFFFF00000000LL) == 0LL) {
+    return true;
+  }
  return false;
 }

@@ -2309,9 +2322,11 @@ encode %{
  enc_class move_long_big_shift_sign( eRegL dst, immI_32_63 cnt ) %{
    emit_opcode( cbuf, 0x8B ); // Move
    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg));
-    emit_d8(cbuf,$primary);
-    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
-    emit_d8(cbuf,$cnt$$constant-32);
+    if( $cnt$$constant > 32 ) { // Shift, if not by zero
+      emit_d8(cbuf,$primary);
+      emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
+      emit_d8(cbuf,$cnt$$constant-32);
+    }
    emit_d8(cbuf,$primary);
    emit_rm(cbuf, 0x3, $secondary, HIGH_FROM_LOW($dst$$reg));
    emit_d8(cbuf,31);
@@ -8842,6 +8857,103 @@ instruct modL_eReg( eADXRegL dst, eRegL src1, eRegL src2, eFlagsReg cr, eCXRegI
  ins_pipe( pipe_slow );
 %}

+// Divide Register Long (no special case since divisor != -1)
+instruct divL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{
+  match(Set dst (DivL dst imm));
+  effect( TEMP tmp, TEMP tmp2, KILL cr );
+  ins_cost(1000);
+  format %{ "MOV    $tmp,abs($imm) # ldiv EDX:EAX,$imm\n\t"
+            "CMP    $tmp,EDX\n\t"
+            "JA,s   fast\n\t"
+            "MOV    $tmp2,EAX\n\t"
+            "MOV    EAX,EDX\n\t"
+            "SAR    EDX,31\n\t"
+            "IDIV   $tmp\n\t"
+            "XCHG   EAX,$tmp2 \n\t"
+            "IDIV   $tmp\n\t"
+            "CDQ\n\t"
+            "ADD    EDX,$tmp2\n\t"
+            "JMP,s  done\n"
+    "fast:\n\t"
+            "IDIV   $tmp\n\t"
+            "XOR    EDX,EDX\n"
+    "done:\n\t"
+            "NEG    EDX:EAX # if $imm < 0" %}
+  ins_encode %{
+    int con = (int)$imm$$constant;
+    assert(con != 0 && con != -1 && con != min_jint, "wrong divisor");
+    int pcon = (con > 0) ? con : -con;
+    Label Lfast, Ldone;
+
+    __ movl($tmp$$Register, pcon);
+    __ cmpl($tmp$$Register, HIGH_FROM_LOW($dst$$Register));
+    __ jccb(Assembler::above, Lfast);
+
+    __ movl($tmp2$$Register, $dst$$Register); // save
+    __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
+    __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // src sign
+    __ idivl($tmp$$Register);
+    __ xchgl($dst$$Register, $tmp2$$Register);
+    __ idivl($tmp$$Register);
+    __ cdql();
+    __ addl(HIGH_FROM_LOW($dst$$Register),$tmp2$$Register);
+    __ jmpb(Ldone);
+
+    __ bind(Lfast);
+    // fast path: src is positive and result fits into 32 bit
+    __ idivl($tmp$$Register);
+    __ xorl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
+
+    __ bind(Ldone);
+    if (con < 0) {
+      __ lneg(HIGH_FROM_LOW($dst$$Register), $dst$$Register);
+    }
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Remainder Register Long (remainder fit into 32 bits)
+instruct modL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{
+  match(Set dst (ModL dst imm));
+  effect( TEMP tmp, TEMP tmp2, KILL cr );
+  ins_cost(1000);
+  format %{ "MOV    $tmp,abs($imm) # lrem EDX:EAX,$imm\n\t"
+            "CMP    $tmp,EDX\n\t"
+            "JA,s   fast\n\t"
+            "MOV    $tmp2,EAX\n\t"
+            "MOV    EAX,EDX\n\t"
+            "SAR    EDX,31\n\t"
+            "IDIV   $tmp\n\t"
+            "MOV    EAX,$tmp2\n"
+    "fast:\n\t"
+            "IDIV   $tmp\n\t"
+            "MOV    EAX,EDX\n\t"
+            "SAR    EDX,31\n\t" %}
+  ins_encode %{
+    int con = (int)$imm$$constant;
+    assert(con != 0 && con != -1 && con != min_jint, "wrong divisor");
+    int pcon = (con > 0) ? con : -con;
+    Label  Lfast;
+
+    __ movl($tmp$$Register, pcon);
+    __ cmpl($tmp$$Register, HIGH_FROM_LOW($dst$$Register));
+    __ jccb(Assembler::above, Lfast); // src is positive and result fits into 32 bit
+
+    __ movl($tmp2$$Register, $dst$$Register); // save
+    __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
+    __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // src sign
+    __ idivl($tmp$$Register);
+    __ movl($dst$$Register, $tmp2$$Register);
+
+    __ bind(Lfast);
+    __ idivl($tmp$$Register);
+    __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
+    __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // result sign
+
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Integer Shift Instructions
 // Shift Left by one
 instruct shlI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{

--- a/src/cpu/x86/vm/x86_64.ad
+++ b/src/cpu/x86/vm/x86_64.ad
@@ -2065,6 +2065,13 @@ bool Matcher::is_spillable_arg(int reg)
  return can_be_java_arg(reg);
 }

+bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
+  // In 64 bit mode a code which use multiply when
+  // devisor is constant is faster than hardware
+  // DIV instruction (it uses MulHiL).
+  return false;
+}
+
 // Register for DIVI projection of divmodI
 RegMask Matcher::divI_proj_mask() {
  return INT_RAX_REG_mask;

--- a/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp
+++ b/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp
 /*
- * Copyright (c) 2006, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -65,10 +65,6 @@ int VM_Version::platform_features(int features) {
  // getisax(2), SI_ARCHITECTURE_32, and SI_ARCHITECTURE_64 are
  // supported on Solaris 10 and later.
  if (os::Solaris::supports_getisax()) {
-#ifndef PRODUCT
-    if (PrintMiscellaneous && Verbose)
-      tty->print_cr("getisax(2) supported.");
-#endif

    // Check 32-bit architecture.
    do_sysinfo(SI_ARCHITECTURE_32, "sparc", &features, v8_instructions_m);
@@ -81,6 +77,11 @@ int VM_Version::platform_features(int features) {
    uint_t avn = os::Solaris::getisax(&av, 1);
    assert(avn == 1, "should only return one av");

+#ifndef PRODUCT
+    if (PrintMiscellaneous && Verbose)
+      tty->print_cr("getisax(2) returned: " PTR32_FORMAT, av);
+#endif
+
    if (av & AV_SPARC_MUL32)  features |= hardware_mul32_m;
    if (av & AV_SPARC_DIV32)  features |= hardware_div32_m;
    if (av & AV_SPARC_FSMULD) features |= hardware_fsmuld_m;
@@ -88,11 +89,22 @@ int VM_Version::platform_features(int features) {
    if (av & AV_SPARC_POPC)   features |= hardware_popc_m;
    if (av & AV_SPARC_VIS)    features |= vis1_instructions_m;
    if (av & AV_SPARC_VIS2)   features |= vis2_instructions_m;
+
+    // Next values are not defined before Solaris 10
+    // but Solaris 8 is used for jdk6 update builds.
+#ifndef AV_SPARC_ASI_BLK_INIT
+#define AV_SPARC_ASI_BLK_INIT 0x0080  /* ASI_BLK_INIT_xxx ASI */
+#endif
+#ifndef AV_SPARC_FMAF
+#define AV_SPARC_FMAF 0x0100  /* Sparc64 Fused Multiply-Add */
+#endif
+    if (av & AV_SPARC_ASI_BLK_INIT) features |= blk_init_instructions_m;
+    if (av & AV_SPARC_FMAF)         features |= fmaf_instructions_m;
  } else {
    // getisax(2) failed, use the old legacy code.
 #ifndef PRODUCT
    if (PrintMiscellaneous && Verbose)
-      tty->print_cr("getisax(2) not supported.");
+      tty->print_cr("getisax(2) is not supported.");
 #endif

    char   tmp;

--- a/src/share/vm/opto/divnode.cpp
+++ b/src/share/vm/opto/divnode.cpp
 /*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -388,7 +388,8 @@ static Node *transform_long_divide( PhaseGVN *phase, Node *dividend, jlong divis
    if (!d_pos) {
      q = new (phase->C, 3) SubLNode(phase->longcon(0), phase->transform(q));
    }
-  } else {
+  } else if ( !Matcher::use_asm_for_ldiv_by_con(d) ) { // Use hardware DIV instruction when
+                                                       // it is faster than code generated below.
    // Attempt the jlong constant divide -> multiply transform found in
    //   "Division by Invariant Integers using Multiplication"
    //     by Granlund and Montgomery
@@ -558,7 +559,7 @@ Node *DivLNode::Ideal( PhaseGVN *phase, bool can_reshape) {

  set_req(0,NULL);              // Dividing by a not-zero constant; no faulting

-  // Dividing by MININT does not optimize as a power-of-2 shift.
+  // Dividing by MINLONG does not optimize as a power-of-2 shift.
  if( l == min_jlong ) return NULL;

  return transform_long_divide( phase, in(1), l );
@@ -1062,7 +1063,7 @@ Node *ModLNode::Ideal(PhaseGVN *phase, bool can_reshape) {
  // Fell thru, the unroll case is not appropriate. Transform the modulo
  // into a long multiply/int multiply/subtract case

-  // Cannot handle mod 0, and min_jint isn't handled by the transform
+  // Cannot handle mod 0, and min_jlong isn't handled by the transform
  if( con == 0 || con == min_jlong ) return NULL;

  // Get the absolute value of the constant; at this point, we can use this
@@ -1075,7 +1076,7 @@ Node *ModLNode::Ideal(PhaseGVN *phase, bool can_reshape) {

  // If this is a power of two, then maybe we can mask it
  if( is_power_of_2_long(pos_con) ) {
-    log2_con = log2_long(pos_con);
+    log2_con = exact_log2_long(pos_con);

    const Type *dt = phase->type(in(1));
    const TypeLong *dtl = dt->isa_long();
@@ -1088,7 +1089,7 @@ Node *ModLNode::Ideal(PhaseGVN *phase, bool can_reshape) {
  // Save in(1) so that it cannot be changed or deleted
  hook->init_req(0, in(1));

-  // Divide using the transform from DivI to MulL
+  // Divide using the transform from DivL to MulL
  Node *result = transform_long_divide( phase, in(1), pos_con );
  if (result != NULL) {
    Node *divide = phase->transform(result);

--- a/src/share/vm/opto/matcher.hpp
+++ b/src/share/vm/opto/matcher.hpp
 /*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -298,6 +298,10 @@ public:
  // Register for MODL projection of divmodL
  static RegMask modL_proj_mask();

+  // Use hardware DIV instruction when it is faster than
+  // a code which use multiply for division by constant.
+  static bool use_asm_for_ldiv_by_con( jlong divisor );
+
  static const RegMask method_handle_invoke_SP_save_mask();

  // Java-Interpreter calling convention