rustc: Prepare to enable ThinLTO by default

This commit prepares to enable ThinLTO and multiple codegen units in release mode by default. We've still got a debuginfo bug or two to sort out before actually turning it on by default.

rustc: Prepare to enable ThinLTO by default
This commit prepares to enable ThinLTO and multiple codegen units in release mode by default. We've still got a debuginfo bug or two to sort out before actually turning it on by default.
855f6d14 · Alex Crichton · 7df4683c · 855f6d14 · 855f6d14 · 855f6d14
8 changed files
--- a/src/librustc/session/config.rs
+++ b/src/librustc/session/config.rs
@@ -383,8 +383,13 @@ pub struct Options {
        // try to not rely on this too much.
        actually_rustdoc: bool [TRACKED],

-        // Number of object files/codegen units to produce on the backend
+        // Specifications of codegen units / ThinLTO which are forced as a
+        // result of parsing command line options. These are not necessarily
+        // what rustc was invoked with, but massaged a bit to agree with
+        // commands like `--emit llvm-ir` which they're often incompatible with
+        // if we otherwise use the defaults of rustc.
        cli_forced_codegen_units: Option<usize> [UNTRACKED],
+        cli_forced_thinlto: Option<bool> [UNTRACKED],
    }
 );

@@ -566,6 +571,7 @@ pub fn basic_options() -> Options {
        debug_assertions: true,
        actually_rustdoc: false,
        cli_forced_codegen_units: None,
+        cli_forced_thinlto: None,
    }
 }

@@ -1165,7 +1171,7 @@ fn parse_optimization_fuel(slot: &mut Option<(String, u64)>, v: Option<&str>) ->
                 "run the non-lexical lifetimes MIR pass"),
    trans_time_graph: bool = (false, parse_bool, [UNTRACKED],
        "generate a graphical HTML report of time spent in trans and LLVM"),
-    thinlto: bool = (false, parse_bool, [TRACKED],
+    thinlto: Option<bool> = (None, parse_opt_bool, [TRACKED],
        "enable ThinLTO when possible"),
    inline_in_all_cgus: Option<bool> = (None, parse_opt_bool, [TRACKED],
        "control whether #[inline] functions are in all cgus"),
@@ -1601,6 +1607,7 @@ pub fn build_session_options_and_crate_config(matches: &getopts::Matches)

    let mut cg = build_codegen_options(matches, error_format);
    let mut codegen_units = cg.codegen_units;
+    let mut thinlto = None;

    // Issue #30063: if user requests llvm-related output to one
    // particular path, disable codegen-units.
@@ -1622,9 +1629,13 @@ pub fn build_session_options_and_crate_config(matches: &getopts::Matches)
                    }
                    early_warn(error_format, "resetting to default -C codegen-units=1");
                    codegen_units = Some(1);
+                    thinlto = Some(false);
                }
            }
-            _ => codegen_units = Some(1),
+            _ => {
+                codegen_units = Some(1);
+                thinlto = Some(false);
+            }
        }
    }

@@ -1834,6 +1845,7 @@ pub fn build_session_options_and_crate_config(matches: &getopts::Matches)
        debug_assertions,
        actually_rustdoc: false,
        cli_forced_codegen_units: codegen_units,
+        cli_forced_thinlto: thinlto,
    },
    cfg)
 }

--- a/src/librustc/session/mod.rs
+++ b/src/librustc/session/mod.rs
@@ -656,30 +656,91 @@ pub fn codegen_units(&self) -> usize {
            return n as usize
        }

+        // Why is 16 codegen units the default all the time?
+        //
+        // The main reason for enabling multiple codegen units by default is to
+        // leverage the ability for the trans backend to do translation and
+        // codegen in parallel. This allows us, especially for large crates, to
+        // make good use of all available resources on the machine once we've
+        // hit that stage of compilation. Large crates especially then often
+        // take a long time in trans/codegen and this helps us amortize that
+        // cost.
+        //
+        // Note that a high number here doesn't mean that we'll be spawning a
+        // large number of threads in parallel. The backend of rustc contains
+        // global rate limiting through the `jobserver` crate so we'll never
+        // overload the system with too much work, but rather we'll only be
+        // optimizing when we're otherwise cooperating with other instances of
+        // rustc.
+        //
+        // Rather a high number here means that we should be able to keep a lot
+        // of idle cpus busy. By ensuring that no codegen unit takes *too* long
+        // to build we'll be guaranteed that all cpus will finish pretty closely
+        // to one another and we should make relatively optimal use of system
+        // resources
+        //
+        // Note that the main cost of codegen units is that it prevents LLVM
+        // from inlining across codegen units. Users in general don't have a lot
+        // of control over how codegen units are split up so it's our job in the
+        // compiler to ensure that undue performance isn't lost when using
+        // codegen units (aka we can't require everyone to slap `#[inline]` on
+        // everything).
+        //
+        // If we're compiling at `-O0` then the number doesn't really matter too
+        // much because performance doesn't matter and inlining is ok to lose.
+        // In debug mode we just want to try to guarantee that no cpu is stuck
+        // doing work that could otherwise be farmed to others.
+        //
+        // In release mode, however (O1 and above) performance does indeed
+        // matter! To recover the loss in performance due to inlining we'll be
+        // enabling ThinLTO by default (the function for which is just below).
+        // This will ensure that we recover any inlining wins we otherwise lost
+        // through codegen unit partitioning.
+        //
+        // ---
+        //
+        // Ok that's a lot of words but the basic tl;dr; is that we want a high
+        // number here -- but not too high. Additionally we're "safe" to have it
+        // always at the same number at all optimization levels.
+        //
+        // As a result 16 was chosen here! Mostly because it was a power of 2
+        // and most benchmarks agreed it was roughly a local optimum. Not very
+        // scientific.
        match self.opts.optimize {
-            // If we're compiling at `-O0` then default to 16 codegen units.
-            // The number here shouldn't matter too too much as debug mode
-            // builds don't rely on performance at all, meaning that lost
-            // opportunities for inlining through multiple codegen units is
-            // a non-issue.
-            //
-            // Note that the high number here doesn't mean that we'll be
-            // spawning a large number of threads in parallel. The backend
-            // of rustc contains global rate limiting through the
-            // `jobserver` crate so we'll never overload the system with too
-            // much work, but rather we'll only be optimizing when we're
-            // otherwise cooperating with other instances of rustc.
-            //
-            // Rather the high number here means that we should be able to
-            // keep a lot of idle cpus busy. By ensuring that no codegen
-            // unit takes *too* long to build we'll be guaranteed that all
-            // cpus will finish pretty closely to one another and we should
-            // make relatively optimal use of system resources
            config::OptLevel::No => 16,
+            _ => 1, // FIXME(#46346) this should be 16
+        }
+    }

-            // All other optimization levels default use one codegen unit,
-            // the historical default in Rust for a Long Time.
-            _ => 1,
+    /// Returns whether ThinLTO is enabled for this compilation
+    pub fn thinlto(&self) -> bool {
+        // If processing command line options determined that we're incompatible
+        // with ThinLTO (e.g. `-C lto --emit llvm-ir`) then return that option.
+        if let Some(enabled) = self.opts.cli_forced_thinlto {
+            return enabled
+        }
+
+        // If explicitly specified, use that with the next highest priority
+        if let Some(enabled) = self.opts.debugging_opts.thinlto {
+            return enabled
+        }
+
+        // If there's only one codegen unit and LTO isn't enabled then there's
+        // no need for ThinLTO so just return false.
+        if self.codegen_units() == 1 && !self.lto() {
+            return false
+        }
+
+        // Right now ThinLTO isn't compatible with incremental compilation.
+        if self.opts.incremental.is_some() {
+            return false
+        }
+
+        // Now we're in "defaults" territory. By default we enable ThinLTO for
+        // optimized compiles (anything greater than O0).
+        match self.opts.optimize {
+            config::OptLevel::No => false,
+            _ => true,
        }
    }
 }

--- a/src/librustc_trans/back/write.rs
+++ b/src/librustc_trans/back/write.rs
@@ -1402,8 +1402,9 @@ fn start_executing_work(tcx: TyCtxt,
        // for doesn't require full LTO. Some targets require one LLVM module
        // (they effectively don't have a linker) so it's up to us to use LTO to
        // link everything together.
-        thinlto: sess.opts.debugging_opts.thinlto &&
-            !sess.target.target.options.requires_lto,
+        thinlto: sess.thinlto() &&
+            !sess.target.target.options.requires_lto &&
+            unsafe { llvm::LLVMRustThinLTOAvailable() },

        no_landing_pads: sess.no_landing_pads(),
        save_temps: sess.opts.cg.save_temps,

--- a/src/librustc_trans/base.rs
+++ b/src/librustc_trans/base.rs
@@ -706,7 +706,7 @@ pub fn trans_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,

    check_for_rustc_errors_attr(tcx);

-    if tcx.sess.opts.debugging_opts.thinlto {
+    if let Some(true) = tcx.sess.opts.debugging_opts.thinlto {
        if unsafe { !llvm::LLVMRustThinLTOAvailable() } {
            tcx.sess.fatal("this compiler's LLVM does not support ThinLTO");
        }

--- a/src/libstd/sys_common/backtrace.rs
+++ b/src/libstd/sys_common/backtrace.rs
@@ -252,8 +252,26 @@ fn output_fileline(w: &mut Write,
 // Note that this demangler isn't quite as fancy as it could be. We have lots
 // of other information in our symbols like hashes, version, type information,
 // etc. Additionally, this doesn't handle glue symbols at all.
-pub fn demangle(writer: &mut Write, s: &str, format: PrintFormat) -> io::Result<()> {
-    // First validate the symbol. If it doesn't look like anything we're
+pub fn demangle(writer: &mut Write, mut s: &str, format: PrintFormat) -> io::Result<()> {
+    // During ThinLTO LLVM may import and rename internal symbols, so strip out
+    // those endings first as they're one of the last manglings applied to
+    // symbol names.
+    let llvm = ".llvm.";
+    if let Some(i) = s.find(llvm) {
+        let candidate = &s[i + llvm.len()..];
+        let all_hex = candidate.chars().all(|c| {
+            match c {
+                'A' ... 'F' | '0' ... '9' => true,
+                _ => false,
+            }
+        });
+
+        if all_hex {
+            s = &s[..i];
+        }
+    }
+
+    // Validate the symbol. If it doesn't look like anything we're
    // expecting, we just print it literally. Note that we must handle non-rust
    // symbols because we could have any function in the backtrace.
    let mut valid = true;

--- a/src/test/run-fail/mir_trans_no_landing_pads.rs
+++ b/src/test/run-fail/mir_trans_no_landing_pads.rs
@@ -8,7 +8,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

-// compile-flags: -Z no-landing-pads
+// compile-flags: -Z no-landing-pads -C codegen-units=1
 // error-pattern:converging_fn called
 use std::io::{self, Write};


--- a/src/test/run-fail/mir_trans_no_landing_pads_diverging.rs
+++ b/src/test/run-fail/mir_trans_no_landing_pads_diverging.rs
@@ -8,7 +8,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

-// compile-flags: -Z no-landing-pads
+// compile-flags: -Z no-landing-pads -C codegen-units=1
 // error-pattern:diverging_fn called
 use std::io::{self, Write};


--- a/src/test/run-pass/no-landing-pads.rs
+++ b/src/test/run-pass/no-landing-pads.rs
@@ -8,7 +8,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

-// compile-flags: -Z no-landing-pads
+// compile-flags: -Z no-landing-pads -C codegen-units=1
 // ignore-emscripten no threads support

 use std::thread;