diff --git a/compiler/rustc_middle/src/mir/mono.rs b/compiler/rustc_middle/src/mir/mono.rs
index 1511e25523559..05a7e46d04d89 100644
--- a/compiler/rustc_middle/src/mir/mono.rs
+++ b/compiler/rustc_middle/src/mir/mono.rs
@@ -223,7 +223,7 @@ impl<'tcx> fmt::Display for MonoItem<'tcx> {
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct CodegenUnit<'tcx> {
     /// A name for this CGU. Incremental compilation requires that
     /// name be unique amongst **all** crates. Therefore, it should
@@ -236,6 +236,7 @@ pub struct CodegenUnit<'tcx> {
     /// True if this is CGU is used to hold code coverage information for dead code,
     /// false otherwise.
     is_code_coverage_dead_code_cgu: bool,
+    was_merged: bool,
 }
 
 /// Specifies the linkage type for a `MonoItem`.
@@ -272,6 +273,7 @@ impl<'tcx> CodegenUnit<'tcx> {
             size_estimate: None,
             primary: false,
             is_code_coverage_dead_code_cgu: false,
+            was_merged: false,
         }
     }
 
@@ -291,6 +293,18 @@ impl<'tcx> CodegenUnit<'tcx> {
         self.primary = true;
     }
 
+    /// Returns `true` once [`CodegenUnit::make_merged`] has been called on
+    /// this CGU; a CGU created through `new` starts out as `false`.
+    pub fn was_merged(&self) -> bool {
+        self.was_merged
+    }
+
+    /// Sets the flag reported by [`CodegenUnit::was_merged`]. The flag is
+    /// bookkeeping only and is skipped when the CGU is stable-hashed.
+    pub fn make_merged(&mut self) {
+        self.was_merged = true;
+    }
+
     /// The order of these items is non-determinstic.
     pub fn items(&self) -> &FxHashMap<MonoItem<'tcx>, (Linkage, Visibility)> {
         &self.items
@@ -411,6 +421,7 @@ impl<'a, 'tcx> HashStable<StableHashingContext<'a>> for CodegenUnit<'tcx> {
             size_estimate: _,
             primary: _,
             is_code_coverage_dead_code_cgu,
+            was_merged: _,
         } = *self;
 
         name.hash_stable(hcx, hasher);
diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index 531644f0b8490..81fab5f53e3d6 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -309,6 +309,7 @@ fn merge_codegen_units<'tcx>(
     let mut cgu_contents: FxHashMap<Symbol, Vec<Symbol>> =
         codegen_units.iter().map(|cgu| (cgu.name(), vec![cgu.name()])).collect();
 
+    let cgu_name_builder = &mut CodegenUnitNameBuilder::new(cx.tcx);
     // Having multiple CGUs can drastically speed up compilation. But for
     // non-incremental builds, tiny CGUs slow down compilation *and* result in
     // worse generated code. So we don't allow CGUs smaller than this (unless
@@ -326,36 +327,75 @@ fn merge_codegen_units<'tcx>(
     // the `compiler_builtins` crate sets `codegen-units = 10000` and it's
     // critical they aren't merged. Also, some tests use explicit small values
     // and likewise won't work if small CGUs are merged.
-    while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize()
-        || (cx.tcx.sess.opts.incremental.is_none()
-            && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
-            && codegen_units.len() > 1
-            && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE))
-    {
-        // Sort small cgus to the back.
-        codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
+    let max_cgus = cx.tcx.sess.codegen_units().as_usize();
+    // Small CGUs may only be merged in non-incremental builds that use the
+    // default CGU count (see above).
+    let merge_small_cgus = cx.tcx.sess.opts.incremental.is_none()
+        && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_));
+
+    if codegen_units.len() > max_cgus || (merge_small_cgus && codegen_units.len() > 1) {
+        while merge_small_cgus
+            && codegen_units.len() > 1
+            && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE)
+        {
+            // Sort small cgus to the back.
+            codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
 
-        let mut smallest = codegen_units.pop().unwrap();
-        let second_smallest = codegen_units.last_mut().unwrap();
+            let Some((mut smallest, second_smallest)) =
+                codegen_units.pop().zip(codegen_units.last_mut())
+            else {
+                break;
+            };
 
-        // Move the mono-items from `smallest` to `second_smallest`
-        second_smallest.modify_size_estimate(smallest.size_estimate());
-        second_smallest.items_mut().extend(smallest.items_mut().drain());
+            // Move the mono-items from `smallest` to `second_smallest`
+            second_smallest.modify_size_estimate(smallest.size_estimate());
+            second_smallest.items_mut().extend(smallest.items_mut().drain());
 
-        // Record that `second_smallest` now contains all the stuff that was
-        // in `smallest` before.
-        let mut consumed_cgu_names = cgu_contents.remove(&smallest.name()).unwrap();
-        cgu_contents.get_mut(&second_smallest.name()).unwrap().append(&mut consumed_cgu_names);
+            // Record that `second_smallest` now contains all the stuff that was
+            // in `smallest` before.
+            let mut consumed_cgu_names = cgu_contents.remove(&smallest.name()).unwrap();
+            cgu_contents.get_mut(&second_smallest.name()).unwrap().append(&mut consumed_cgu_names);
 
-        debug!(
-            "CodegenUnit {} merged into CodegenUnit {}",
-            smallest.name(),
-            second_smallest.name()
-        );
+            debug!(
+                "CodegenUnit {} merged into CodegenUnit {}",
+                smallest.name(),
+                second_smallest.name()
+            );
+        }
+
+        // Distribute the remaining CGUs, largest first, over at most
+        // `max_cgus` slots, always adding to the currently smallest slot.
+        codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
+        let fallback_cgu_name = fallback_cgu_name(cgu_name_builder);
+        let mut default = CodegenUnit::new(fallback_cgu_name);
+        default.create_size_estimate(cx.tcx);
+        let mut merged_subsets: Vec<CodegenUnit<'_>> =
+            vec![default; max_cgus.min(codegen_units.len())];
+
+        codegen_units.iter_mut().for_each(|cgu| {
+            // Unfilled placeholder slots sort before filled ones, so no
+            // placeholder survives even if a real CGU has a size estimate of 0.
+            let min = merged_subsets
+                .iter()
+                .enumerate()
+                .min_by_key(|(_, slot)| (slot.was_merged(), slot.size_estimate()))
+                .map(|(i, _)| i)
+                .unwrap_or(0);
+            let min_cgu = &mut merged_subsets[min];
+            if !min_cgu.was_merged() {
+                *min_cgu = std::mem::replace(cgu, CodegenUnit::new(fallback_cgu_name));
+                min_cgu.make_merged();
+            } else {
+                min_cgu.modify_size_estimate(cgu.size_estimate());
+                min_cgu.items_mut().extend(cgu.items_mut().drain());
+                let mut consumed_cgu_names = cgu_contents.remove(&cgu.name()).unwrap();
+                cgu_contents.get_mut(&min_cgu.name()).unwrap().append(&mut consumed_cgu_names);
+                debug!("CodegenUnit {} merged into CodegenUnit {}", cgu.name(), min_cgu.name());
+            }
+        });
+        *codegen_units = merged_subsets;
     }
 
-    let cgu_name_builder = &mut CodegenUnitNameBuilder::new(cx.tcx);
-
     if cx.tcx.sess.opts.incremental.is_some() {
         // If we are doing incremental compilation, we want CGU names to
         // reflect the path of the source level module they correspond to.
diff --git a/compiler/rustc_ty_utils/src/ty.rs b/compiler/rustc_ty_utils/src/ty.rs
index fe2d1fba7fe6e..105236400519f 100644
--- a/compiler/rustc_ty_utils/src/ty.rs
+++ b/compiler/rustc_ty_utils/src/ty.rs
@@ -454,12 +454,33 @@ fn instance_def_size_estimate<'tcx>(
     tcx: TyCtxt<'tcx>,
     instance_def: ty::InstanceDef<'tcx>,
 ) -> usize {
+    use rustc_middle::mir::StatementKind;
     use ty::InstanceDef;
-
     match instance_def {
         InstanceDef::Item(..) | InstanceDef::DropGlue(..) => {
             let mir = tcx.instance_mir(instance_def);
-            mir.basic_blocks.iter().map(|bb| bb.statements.len() + 1).sum()
+            mir.basic_blocks
+                .iter()
+                .map(|bb| {
+                    bb.statements
+                        .iter()
+                        .filter(|s| {
+                            // Do not count non-codegen statements.
+                            !matches!(
+                                s.kind,
+                                StatementKind::Deinit(..)
+                                    | StatementKind::FakeRead(..)
+                                    | StatementKind::Retag { .. }
+                                    | StatementKind::AscribeUserType(..)
+                                    | StatementKind::ConstEvalCounter
+                                    | StatementKind::PlaceMention(..)
+                                    | StatementKind::Nop
+                            )
+                        })
+                        .count()
+                        + 1
+                })
+                .sum()
         }
         // Estimate the size of other compiler-generated shims to be 1.
         _ => 1,