Skip to content

Commit 9ca81df

Browse files
committed
fix: more robustness in the face of a trampling-herd of threads loading a single index.
The motivating example is here: praetorian-inc/noseyparker#179
1 parent 7b3dc92 commit 9ca81df

File tree

5 files changed

+98
-19
lines changed

5 files changed

+98
-19
lines changed

gitoxide-core/src/repository/odb.rs

+68-9
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
use std::io;
2+
use std::sync::atomic::Ordering;
23

34
use anyhow::bail;
45

@@ -50,6 +51,8 @@ pub mod statistics {
5051
pub struct Options {
5152
pub format: OutputFormat,
5253
pub thread_limit: Option<usize>,
54+
/// A debug-flag that triggers looking up the headers of all objects again, but without indices preloaded
55+
pub extra_header_lookup: bool,
5356
}
5457
}
5558

@@ -59,7 +62,11 @@ pub fn statistics(
5962
mut progress: impl gix::Progress,
6063
out: impl io::Write,
6164
mut err: impl io::Write,
62-
statistics::Options { format, thread_limit }: statistics::Options,
65+
statistics::Options {
66+
format,
67+
thread_limit,
68+
extra_header_lookup,
69+
}: statistics::Options,
6370
) -> anyhow::Result<()> {
6471
use bytesize::ByteSize;
6572
use gix::odb::{find, HeaderExt};
@@ -76,6 +83,10 @@ pub fn statistics(
7683
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
7784
#[derive(Default)]
7885
struct Statistics {
86+
/// All objects that were used to produce these statistics.
87+
/// Only `Some` if we are doing an extra round of header queries on a repository without loaded indices.
88+
#[cfg_attr(feature = "serde", serde(skip_serializing))]
89+
ids: Option<Vec<gix::ObjectId>>,
7990
total_objects: usize,
8091
loose_objects: usize,
8192
packed_objects: usize,
@@ -135,14 +146,17 @@ pub fn statistics(
135146
}
136147

137148
impl gix::parallel::Reduce for Reduce {
138-
type Input = Result<Vec<gix::odb::find::Header>, anyhow::Error>;
149+
type Input = Result<Vec<(gix::ObjectId, gix::odb::find::Header)>, anyhow::Error>;
139150
type FeedProduce = ();
140151
type Output = Statistics;
141152
type Error = anyhow::Error;
142153

143154
fn feed(&mut self, items: Self::Input) -> Result<Self::FeedProduce, Self::Error> {
144-
for item in items? {
155+
for (id, item) in items? {
145156
self.stats.consume(item);
157+
if let Some(ids) = self.stats.ids.as_mut() {
158+
ids.push(id);
159+
}
146160
}
147161
Ok(())
148162
}
@@ -154,9 +168,9 @@ pub fn statistics(
154168
}
155169

156170
let cancelled = || anyhow::anyhow!("Cancelled by user");
157-
let object_ids = repo.objects.store_ref().iter()?.filter_map(Result::ok);
171+
let object_ids = repo.objects.iter()?.filter_map(Result::ok);
158172
let chunk_size = 1_000;
159-
let stats = if gix::parallel::num_threads(thread_limit) > 1 {
173+
let mut stats = if gix::parallel::num_threads(thread_limit) > 1 {
160174
gix::parallel::in_parallel(
161175
gix::interrupt::Iter::new(
162176
gix::features::iter::Chunks {
@@ -166,19 +180,30 @@ pub fn statistics(
166180
cancelled,
167181
),
168182
thread_limit,
169-
move |_| (repo.objects.clone().into_inner(), counter),
183+
{
184+
let objects = repo.objects.clone();
185+
move |_| (objects.clone().into_inner(), counter)
186+
},
170187
|ids, (handle, counter)| {
171188
let ids = ids?;
172-
counter.fetch_add(ids.len(), std::sync::atomic::Ordering::Relaxed);
189+
counter.fetch_add(ids.len(), Ordering::Relaxed);
173190
let out = ids
174191
.into_iter()
175-
.map(|id| handle.header(id))
192+
.map(|id| handle.header(id).map(|hdr| (id, hdr)))
176193
.collect::<Result<Vec<_>, _>>()?;
177194
Ok(out)
178195
},
179-
Reduce::default(),
196+
Reduce {
197+
stats: Statistics {
198+
ids: extra_header_lookup.then(Vec::new),
199+
..Default::default()
200+
},
201+
},
180202
)?
181203
} else {
204+
if extra_header_lookup {
205+
bail!("extra-header-lookup is only meaningful in threaded mode");
206+
}
182207
let mut stats = Statistics::default();
183208

184209
for (count, id) in object_ids.enumerate() {
@@ -193,6 +218,40 @@ pub fn statistics(
193218

194219
progress.show_throughput(start);
195220

221+
if let Some(mut ids) = stats.ids.take() {
222+
// Critical to re-open the repo to assure we don't have any ODB state and start fresh.
223+
let start = std::time::Instant::now();
224+
let repo = gix::open_opts(repo.git_dir(), repo.open_options().to_owned())?;
225+
progress.set_name("re-counting".into());
226+
progress.init(Some(ids.len()), gix::progress::count("objects"));
227+
let counter = progress.counter();
228+
counter.store(0, Ordering::Relaxed);
229+
dbg!("starting");
230+
let errors = gix::parallel::in_parallel_with_slice(
231+
&mut ids,
232+
thread_limit,
233+
{
234+
let objects = repo.objects.clone();
235+
move |_| (objects.clone().into_inner(), counter, false)
236+
},
237+
|id, (odb, counter, has_error), _threads_left, _stop_everything| -> anyhow::Result<()> {
238+
counter.fetch_add(1, Ordering::Relaxed);
239+
if let Err(_err) = odb.header(id) {
240+
*has_error = true;
241+
gix::trace::error!(err = ?_err, "Object that is known to be present wasn't found");
242+
}
243+
Ok(())
244+
},
245+
|| Some(std::time::Duration::from_millis(100)),
246+
|(_, _, has_error)| has_error,
247+
)?;
248+
249+
progress.show_throughput(start);
250+
if errors.contains(&true) {
251+
bail!("At least one object couldn't be looked up even though it must exist");
252+
}
253+
}
254+
196255
#[cfg(feature = "serde")]
197256
{
198257
serde_json::to_writer_pretty(out, &stats)?;

gix-odb/src/store_impls/dynamic/load_index.rs

+18-6
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,7 @@ impl super::Store {
8686
Ok(Some(self.collect_snapshot()))
8787
} else {
8888
// always compare to the latest state
89-
// Nothing changed in the mean time, try to load another index…
89+
// Nothing changed in the meantime, try to load another index…
9090
if self.load_next_index(index) {
9191
Ok(Some(self.collect_snapshot()))
9292
} else {
@@ -119,7 +119,7 @@ impl super::Store {
119119
let slot = &self.files[index.slot_indices[slot_map_index]];
120120
let _lock = slot.write.lock();
121121
if slot.generation.load(Ordering::SeqCst) > index.generation {
122-
// There is a disk consolidation in progress which just overwrote a slot that cold be disposed with some other
122+
// There is a disk consolidation in progress which just overwrote a slot that could be disposed with some other
123123
// index, one we didn't intend to load.
124124
// Continue with the next slot index in the hope there is something else we can do…
125125
continue 'retry_with_next_slot_index;
@@ -134,7 +134,8 @@ impl super::Store {
134134
slot.files.store(bundle);
135135
break 'retry_with_next_slot_index;
136136
}
137-
Err(_) => {
137+
Err(_err) => {
138+
gix_features::trace::error!(err=?_err, "Failed to load index file - some objects may seem to not exist");
138139
slot.files.store(bundle);
139140
continue 'retry_with_next_slot_index;
140141
}
@@ -183,27 +184,30 @@ impl super::Store {
183184
let previous_index_state = Arc::as_ptr(&index) as usize;
184185

185186
// IMPORTANT: get a lock after we recorded the previous state.
187+
eprintln!("needs_init: {needs_init}, load_new_index: {load_new_index}");
186188
let write = self.write.lock();
187189
let objects_directory = &self.path;
188190

189191
// Now we know the index isn't going to change anymore, even though threads might still load indices in the meantime.
190192
let index = self.index.load();
191193
if previous_index_state != Arc::as_ptr(&index) as usize {
192194
// Someone else took the look before and changed the index. Return it without doing any additional work.
195+
dbg!("somebody already did the work - arc pointer changed");
193196
return Ok(Some(self.collect_snapshot()));
194197
}
195198

196199
let was_uninitialized = !index.is_initialized();
197200

198201
// We might not be able to detect by pointer if the state changed, as this itself is racy. So we keep track of double-initialization
199202
// using a flag, which means that if `needs_init` was true we saw the index uninitialized once, but now that we are here it's
200-
// initialized meaning that somebody was faster and we couldn't detect it by comparisons to the index.
203+
// initialized meaning that somebody was faster, and we couldn't detect it by comparisons to the index.
201204
// If so, make sure we collect the snapshot instead of returning None in case nothing actually changed, which is likely with a
202205
// race like this.
203206
if !was_uninitialized && needs_init {
207+
dbg!("somebody already did the work - isn't uninitialized anymore");
204208
return Ok(Some(self.collect_snapshot()));
205209
}
206-
self.num_disk_state_consolidation.fetch_add(1, Ordering::Relaxed);
210+
dbg!(self.num_disk_state_consolidation.fetch_add(1, Ordering::Relaxed));
207211

208212
let db_paths: Vec<_> = std::iter::once(objects_directory.to_owned())
209213
.chain(crate::alternate::resolve(objects_directory.clone(), &self.current_dir)?)
@@ -382,6 +386,7 @@ impl super::Store {
382386
.unwrap_or_else(|| Arc::new(num_loaded_indices.into())),
383387
num_indices_currently_being_loaded: Default::default(),
384388
});
389+
dbg!("storing new index");
385390
self.index.store(new_index);
386391
}
387392

@@ -408,7 +413,14 @@ impl super::Store {
408413

409414
let new_index = self.index.load();
410415
Ok(if index.state_id() == new_index.state_id() {
411-
// there was no change, and nothing was loaded in the meantime, reflect that in the return value to not get into loops
416+
dbg!("nothing changed after reloading");
417+
// there was no change, and nothing was loaded in the meantime, reflect that in the return value to not get into loops.
418+
// However, if we came here in desperation for something, we should collect what's there for a chance that
419+
// it will be what was needed.
420+
// The original problem here is that a trampling-herd of threads want to load just a single index. Most of these threads
421+
// notice that something changed in the meantime and can avoid checking disk once again. However, one might not get it
422+
// as their index is just too new. Maybe there is a way to detect this kind of race or to prevent it?
423+
// load_new_index.then(|| self.collect_snapshot())
412424
None
413425
} else {
414426
if load_new_index {

gix-odb/src/store_impls/dynamic/types.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ pub(crate) type AtomicGeneration = AtomicU32;
1818

1919
/// A way to indicate which pack indices we have seen already and which of them are loaded, along with an idea
2020
/// of whether stored `PackId`s are still usable.
21-
#[derive(Default, Copy, Clone)]
21+
#[derive(Default, Copy, Clone, Debug)]
2222
pub struct SlotIndexMarker {
2323
/// The generation the `loaded_until_index` belongs to. Indices of different generations are completely incompatible.
2424
/// This value changes once the internal representation is compacted, something that may happen only if there is no handle

src/plumbing/main.rs

+6-2
Original file line number | Diff line number | Diff line change
@@ -1153,7 +1153,7 @@ pub fn main() -> Result<()> {
11531153
),
11541154
},
11551155
Subcommands::Odb(cmd) => match cmd {
1156-
odb::Subcommands::Stats => prepare_and_run(
1156+
odb::Subcommands::Stats { extra_header_lookup } => prepare_and_run(
11571157
"odb-stats",
11581158
trace,
11591159
auto_verbose,
@@ -1166,7 +1166,11 @@ pub fn main() -> Result<()> {
11661166
progress,
11671167
out,
11681168
err,
1169-
core::repository::odb::statistics::Options { format, thread_limit },
1169+
core::repository::odb::statistics::Options {
1170+
format,
1171+
thread_limit,
1172+
extra_header_lookup,
1173+
},
11701174
)
11711175
},
11721176
),

src/plumbing/options/mod.rs

+5-1
Original file line number | Diff line number | Diff line change
@@ -586,7 +586,11 @@ pub mod odb {
586586
Info,
587587
/// Count and obtain information on all, possibly duplicate, objects in the database.
588588
#[clap(visible_alias = "statistics")]
589-
Stats,
589+
Stats {
590+
/// Lookup headers again, but without preloading indices.
591+
#[clap(long)]
592+
extra_header_lookup: bool,
593+
},
590594
}
591595
}
592596

0 commit comments

Comments (0)