diff --git a/git-index/src/entry.rs b/git-index/src/entry.rs index efc51aa353c..460767a41ad 100644 --- a/git-index/src/entry.rs +++ b/git-index/src/entry.rs @@ -1,3 +1,4 @@ +use crate::Version; use bitflags::bitflags; /// The stage of an entry, one of 0 = base, 1 = ours, 2 = theirs @@ -42,6 +43,13 @@ pub(crate) mod at_rest { } } + impl Flags { + pub fn to_memory(self) -> super::Flags { + super::Flags::from_bits((self & (Flags::PATH_LEN | Flags::STAGE_MASK | Flags::ASSUME_VALID)).bits as u32) + .expect("PATHLEN is part of memory representation") + } + } + bitflags! { /// Extended flags - add flags for serialization here and offset them down to u16. pub struct FlagsExtended: u16 { @@ -56,13 +64,6 @@ pub(crate) mod at_rest { } } - impl Flags { - pub fn to_memory(self) -> super::Flags { - super::Flags::from_bits((self & (Flags::PATH_LEN | Flags::STAGE_MASK | Flags::ASSUME_VALID)).bits as u32) - .expect("PATHLEN is part of memory representation") - } - } - #[cfg(test)] mod tests { use super::*; @@ -112,6 +113,11 @@ impl Flags { pub fn stage(&self) -> Stage { (*self & Flags::STAGE_MASK).bits >> 12 } + + pub fn to_storage(&self, version: Version) -> at_rest::Flags { + assert_eq!(version, Version::V2, "Can only encode V2 flags at the moment"); + at_rest::Flags::from_bits(self.bits() as u16).unwrap() + } } #[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone, Copy)] @@ -173,3 +179,20 @@ mod _impls { } } } + +#[cfg(test)] +mod tests { + use crate::entry::at_rest; + use crate::Version; + + #[test] + fn in_mem_flags_to_storage_flags_v2() { + let flag_bytes = u16::from_be_bytes(*b"\x00\x01"); + let flags_at_rest = at_rest::Flags::from_bits(flag_bytes).unwrap(); + let in_memory_flags = flags_at_rest.to_memory(); + + let output = in_memory_flags.to_storage(Version::V2); + + assert_eq!(output.bits(), flag_bytes); + } +} diff --git a/git-index/src/extension/mod.rs b/git-index/src/extension/mod.rs index cbf7c0e313f..143f034fd35 100644 --- a/git-index/src/extension/mod.rs +++ b/git-index/src/extension/mod.rs @@ -14,7 +14,7 @@ pub struct Iter<'a> { /// /// It allows to more quickly build trees by avoiding as it can quickly re-use portions of the index and its associated tree ids /// if there was no change to them. Portions of this tree are invalidated as the index is changed. -#[derive(Clone)] +#[derive(PartialEq, Eq, Clone, Debug)] pub struct Tree { pub name: SmallVec<[u8; 23]>, /// The id of the directory tree of the associated tree object. diff --git a/git-index/src/file/mod.rs b/git-index/src/file/mod.rs index 700c7934b39..a3eef785c9a 100644 --- a/git-index/src/file/mod.rs +++ b/git-index/src/file/mod.rs @@ -32,3 +32,4 @@ mod impl_ { } pub mod init; pub mod verify; +pub mod write; diff --git a/git-index/src/file/write.rs b/git-index/src/file/write.rs new file mode 100644 index 00000000000..26aa0168917 --- /dev/null +++ b/git-index/src/file/write.rs @@ -0,0 +1,12 @@ +use crate::{write, File}; +use git_features::hash; + +impl File { + pub fn write_to(&self, mut out: &mut impl std::io::Write, options: write::Options) -> std::io::Result<()> { + let mut hasher = hash::Write::new(&mut out, options.hash_kind); + self.state.write_to(&mut hasher, options)?; + + let hash = hasher.hash.digest(); + out.write_all(&hash) + } +} diff --git a/git-index/src/write.rs b/git-index/src/write.rs index 73592dd6548..a8c24b363a7 100644 --- a/git-index/src/write.rs +++ b/git-index/src/write.rs @@ -1,162 +1,231 @@ -use std::{ - collections::{hash_map, HashMap}, - ops::Range, -}; - -use bstr::ByteVec; - use crate::{extension, State, Version}; +use std::convert::{TryFrom, TryInto}; +use std::io::Write; -impl State { - pub fn write_to(&self, options: Options) -> Vec { - let mut writer = Writer::new(self, options); - writer.generate(); - writer.data - } -} - -#[derive(Default)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct Options { - hash_kind: git_hash::Kind, -} - -struct Writer<'a> { - state: &'a State, - options: Options, - data: Vec, - index_table: HashMap<&'static str, Range>, + /// The hash kind to use when writing the index file. + /// + /// It is not always possible to infer the hash kind when reading an index, so this is required. + pub hash_kind: git_hash::Kind, + /// The index version to write. Note that different versions affect the format and ultimately the size. + pub version: Version, + + pub tree_cache_extension: bool, + pub end_of_index_entry_extension: bool, } -impl<'a> Writer<'a> { - pub fn new(state: &'a State, options: Options) -> Self { +impl Default for Options { + fn default() -> Self { Self { - state, - options, - data: Vec::default(), - index_table: Default::default(), + hash_kind: git_hash::Kind::default(), + version: Version::V2, + tree_cache_extension: true, + end_of_index_entry_extension: true, } } +} - pub fn generate(&mut self) { - self.header(); - self.entries(); +impl State { + pub fn write_to( + &self, + out: &mut impl std::io::Write, + Options { + hash_kind, + version, + tree_cache_extension, + end_of_index_entry_extension, + }: Options, + ) -> std::io::Result<()> { + assert_eq!( + version, + Version::V2, + "can only write V2 at the moment, please come back later" + ); + + let mut write = CountBytes::new(out); + let num_entries = self + .entries() + .len() + .try_into() + .expect("definitely not 4billion entries"); + + let header_offset = header(&mut write, version, num_entries)?; + let entries_offset = entries(&mut write, self, header_offset, version)?; + let tree_offset = if tree_cache_extension { + tree_ext(&mut write, self.tree())? + } else { + entries_offset + }; - // TODO: Tree extension is always included, I think - if let Some(t) = self.state.tree() { - self.tree(t) + if num_entries > 0 && end_of_index_entry_extension { + end_of_index_entry_ext(write.inner, hash_kind, entries_offset, tree_offset)?; } - self.end_of_index(); + Ok(()) } +} + +fn header( + out: &mut CountBytes<'_, T>, + version: Version, + num_entries: u32, +) -> Result { + let signature = b"DIRC"; + + let version = match version { + Version::V2 => 2_u32.to_be_bytes(), + Version::V3 => 3_u32.to_be_bytes(), + Version::V4 => 4_u32.to_be_bytes(), + }; + + out.write_all(signature)?; + out.write_all(&version)?; + out.write_all(&num_entries.to_be_bytes())?; - fn push(&mut self, data: &[u8], key: &'static str) { - let start = self.data.len(); - let end = start + data.len(); + Ok(out.count) +} - match self.index_table.entry(key) { - hash_map::Entry::Occupied(mut e) => e.get_mut().end = end, - hash_map::Entry::Vacant(e) => { - e.insert(start..end); +fn entries( + out: &mut CountBytes<'_, T>, + state: &State, + header_size: u32, + version: Version, +) -> Result { + for entry in state.entries() { + let stat = entry.stat; + out.write_all(&stat.ctime.secs.to_be_bytes())?; + out.write_all(&stat.ctime.nsecs.to_be_bytes())?; + out.write_all(&stat.mtime.secs.to_be_bytes())?; + out.write_all(&stat.mtime.nsecs.to_be_bytes())?; + out.write_all(&stat.dev.to_be_bytes())?; + out.write_all(&stat.ino.to_be_bytes())?; + out.write_all(&entry.mode.bits().to_be_bytes())?; + out.write_all(&stat.uid.to_be_bytes())?; + out.write_all(&stat.gid.to_be_bytes())?; + out.write_all(&stat.size.to_be_bytes())?; + out.write_all(entry.id.as_bytes())?; + let path = entry.path(state); + let path_len: u16 = path + .len() + .try_into() + .expect("Cannot handle paths longer than 16bits ever"); + assert!( + path_len <= 0xFFF, + "Paths can't be longer than 12 bits as they share space with bit flags in a u16" + ); + out.write_all(&(entry.flags.to_storage(version).bits() | path_len).to_be_bytes())?; + out.write_all(path)?; + out.write_all(b"\0")?; + + match (out.count - header_size) % 8 { + 0 => {} + n => { + let eight_null_bytes = [0u8; 8]; + out.write_all(&eight_null_bytes[n as usize..])?; } }; - - self.data.push_str(data); } - fn header(&mut self) { - let signature = b"DIRC"; - let version = match self.state.version() { - Version::V2 => 2_u32.to_be_bytes(), - Version::V3 => 3_u32.to_be_bytes(), - Version::V4 => 4_u32.to_be_bytes(), - }; - let num_entries = self.state.entries().len() as u32; + Ok(out.count) +} - self.push(signature, "header"); - self.push(&version, "header"); - self.push(&(num_entries).to_be_bytes(), "header"); - } +fn tree_ext( + out: &mut CountBytes<'_, T>, + tree: Option<&extension::Tree>, +) -> Result { + fn tree_entry(out: &mut impl std::io::Write, tree: &extension::Tree) -> Result<(), std::io::Error> { + let num_entries_ascii = tree.num_entries.to_string(); + let num_children_ascii = tree.children.len().to_string(); + + out.write_all(tree.name.as_slice())?; + out.write_all(b"\0")?; + out.write_all(num_entries_ascii.as_bytes())?; + out.write_all(b" ")?; + out.write_all(num_children_ascii.as_bytes())?; + out.write_all(b"\n")?; + out.write_all(tree.id.as_bytes())?; - fn entries(&mut self) { - for e in self.state.entries() { - self.push(&e.stat.ctime.secs.to_be_bytes(), "entries"); - self.push(&e.stat.ctime.nsecs.to_be_bytes(), "entries"); - self.push(&e.stat.mtime.secs.to_be_bytes(), "entries"); - self.push(&e.stat.mtime.nsecs.to_be_bytes(), "entries"); - self.push(&e.stat.dev.to_be_bytes(), "entries"); - self.push(&e.stat.ino.to_be_bytes(), "entries"); - self.push(&e.mode.bits().to_be_bytes(), "entries"); - self.push(&e.stat.uid.to_be_bytes(), "entries"); - self.push(&e.stat.gid.to_be_bytes(), "entries"); - self.push(&e.stat.size.to_be_bytes(), "entries"); - self.push(e.id.as_bytes(), "entries"); - //FIXME: correct flag values - // probably convert 'in-memory' Flags to at_rest::Flags - // self.push(&e.flags.bits().to_be_bytes(), "entries"); - self.push(b"\x00\x01\x61\x00", "entries"); - - println!("{:?}", e.flags.bits()); + for child in &tree.children { + tree_entry(out, child)?; } + + Ok(()) } - fn tree(&mut self, tree: &extension::Tree) { - let signature = b"TREE"; - let mut size: u32 = 0; + if let Some(tree) = tree { + let signature = extension::tree::SIGNATURE; - self.push(signature, "tree"); - self.push(&size.to_be_bytes(), "tree"); + let estimated_size = tree.num_entries * (300 + 3 + 1 + 3 + 1 + 20); + let mut entries: Vec = Vec::with_capacity(estimated_size as usize); + tree_entry(&mut entries, tree)?; - self.tree_entry(tree); + out.write_all(&signature)?; + out.write_all(&(u32::try_from(entries.len()).expect("less than 4GB tree extension")).to_be_bytes())?; + out.write_all(&entries)?; + } - if let Some(range) = self.index_table.get("tree") { - size = (range.end - (range.start + 8)) as u32; - self.data[range.start + 4..range.start + 8].copy_from_slice(&size.to_be_bytes()); - } + Ok(out.count) +} + +fn end_of_index_entry_ext( + out: &mut impl std::io::Write, + hash_kind: git_hash::Kind, + entries_offset: u32, + tree_offset: u32, +) -> Result<(), std::io::Error> { + let signature = extension::end_of_index_entry::SIGNATURE; + let extension_size = 4 + hash_kind.len_in_bytes() as u32; + + let mut hasher = git_features::hash::hasher(hash_kind); + let tree_size = (tree_offset - entries_offset).saturating_sub(8); + if tree_size > 0 { + hasher.update(&extension::tree::SIGNATURE); + hasher.update(&tree_size.to_be_bytes()); } + let hash = hasher.digest(); - fn tree_entry(&mut self, tree: &extension::Tree) { - let path = [tree.name.as_slice(), b"\0"].concat(); + out.write_all(&signature)?; + out.write_all(&extension_size.to_be_bytes())?; + out.write_all(&entries_offset.to_be_bytes())?; + out.write_all(&hash)?; - let num_entries_ascii = tree.num_entries.to_string(); - let num_children_ascii = tree.children.len().to_string(); + Ok(()) +} - self.push(path.as_slice(), "tree"); - self.push(num_entries_ascii.as_bytes(), "tree"); - self.push(b" ", "tree"); - self.push(num_children_ascii.as_bytes(), "tree"); - self.push(b"\n", "tree"); - self.push(tree.id.as_bytes(), "tree"); +struct CountBytes<'a, T> { + count: u32, + inner: &'a mut T, +} - for child in &tree.children { - self.tree_entry(child); - } +impl<'a, T> CountBytes<'a, T> +where + T: std::io::Write, +{ + pub fn new(inner: &'a mut T) -> Self { + CountBytes { inner, count: 0 } } +} - fn end_of_index(&mut self) { - match self.index_table.get("entries") { - Some(range) => { - let signature = b"EOIE"; - let extension_size = 4 + self.options.hash_kind.len_in_bytes() as u32; - let offset: u32 = range.end as u32; - - let mut hasher = git_features::hash::hasher(self.options.hash_kind); - - match self.index_table.get("tree") { - Some(range) => { - hasher.update(b"TREE"); - hasher.update(&self.data[range.start + 4..range.start + 8]); - } - None => {} - } - - let hash = hasher.digest(); - - self.data.push_str(signature); - self.data.push_str(extension_size.to_be_bytes()); - self.data.push_str(offset.to_be_bytes()); - self.data.push_str(hash); - } - None => {} - } +impl<'a, T> std::io::Write for CountBytes<'a, T> +where + T: std::io::Write, +{ + fn write(&mut self, buf: &[u8]) -> std::io::Result { + let written = self.inner.write(buf)?; + self.count = self + .count + .checked_add(u32::try_from(written).expect("we don't write 4GB buffers")) + .ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::Other, + "Cannot write indices larger than 4 gigabytes", + ) + })?; + Ok(written) + } + + fn flush(&mut self) -> std::io::Result<()> { + self.inner.flush() } } diff --git a/git-index/tests/index/file/write.rs b/git-index/tests/index/file/write.rs index aa43428167b..b8908522fc7 100644 --- a/git-index/tests/index/file/write.rs +++ b/git-index/tests/index/file/write.rs @@ -1,38 +1,168 @@ -use git_index::write::Options; +use filetime::FileTime; +use git_index::{decode, write, State, Version}; +use std::cmp::{max, min}; #[test] -fn v2_empty() { - let path = crate::fixture_index_path("V2_empty"); - let index = git_index::File::at(&path, git_index::decode::Options::default()).unwrap(); - let expected = std::fs::read(&path).unwrap(); - let expected_without_hash = &expected[..expected.len() - 20]; +fn roundtrips() { + let input = [ + ("V2_empty", write::Options::default()), + ("v2", write::Options::default()), + ( + "v2_more_files", + write::Options { + end_of_index_entry_extension: false, + ..write::Options::default() + }, + ), + ]; - let output = index.write_to(Options::default()); + for (fixture, options) in input { + let path = crate::fixture_index_path(fixture); + let expected_index = git_index::File::at(&path, decode::Options::default()).unwrap(); + let expected_bytes = std::fs::read(&path).unwrap(); + let mut out_bytes = Vec::::new(); - assert_eq!(output, expected_without_hash); + expected_index.write_to(&mut out_bytes, options).unwrap(); + let (out_index, _) = State::from_bytes(&out_bytes, FileTime::now(), decode::Options::default()).unwrap(); + + compare_states(&out_index, &expected_index, options, fixture); + compare_raw_bytes(&out_bytes, &expected_bytes, fixture); + } } #[test] -fn v2() { - let path = crate::fixture_index_path("v2"); - let index = git_index::File::at(&path, git_index::decode::Options::default()).unwrap(); - let expected = std::fs::read(&path).unwrap(); - let expected_without_hash = &expected[..expected.len() - 20]; +fn v2_index_no_extensions() { + let input = [ + "V2_empty", + "v2", + "v2_more_files", + "v2_split_index", + "v4_more_files_IEOT", + ]; + + for fixture in input { + let path = crate::fixture_index_path(fixture); + let expected = git_index::File::at(&path, decode::Options::default()).unwrap(); + + let mut out = Vec::::new(); + let options = write::Options { + hash_kind: git_hash::Kind::Sha1, + version: Version::V2, + tree_cache_extension: false, + end_of_index_entry_extension: false, + }; + + expected.write_to(&mut out, options).unwrap(); + + let (generated, _) = State::from_bytes(&out, FileTime::now(), decode::Options::default()).unwrap(); + compare_states(&generated, &expected, options, fixture); + } +} + +#[test] +fn v2_index_tree_extensions() { + let input = [ + "V2_empty", + "v2", + "v2_more_files", + "v2_split_index", + "v4_more_files_IEOT", + ]; + + for fixture in input { + let path = crate::fixture_index_path(fixture); + let expected = git_index::File::at(&path, decode::Options::default()).unwrap(); - let output = index.write_to(Options::default()); + let mut out = Vec::::new(); + let options = write::Options { + hash_kind: git_hash::Kind::Sha1, + version: Version::V2, + tree_cache_extension: true, + end_of_index_entry_extension: false, + }; - assert_eq!(output, expected_without_hash); + expected.write_to(&mut out, options).unwrap(); + + let (generated, _) = State::from_bytes(&out, FileTime::now(), decode::Options::default()).unwrap(); + compare_states(&generated, &expected, options, fixture); + } } #[test] -#[ignore] -fn v2_more_files() { - let path = crate::fixture_index_path("v2_more_files"); - let index = git_index::File::at(&path, git_index::decode::Options::default()).unwrap(); - let expected = std::fs::read(&path).unwrap(); - let expected_without_hash = &expected[..expected.len() - 20]; +fn v2_index_eoie_extensions() { + let input = [ + "V2_empty", + "v2", + "v2_more_files", + "v2_split_index", + "v4_more_files_IEOT", + ]; + + for fixture in input { + let path = crate::fixture_index_path(fixture); + let expected = git_index::File::at(&path, decode::Options::default()).unwrap(); + + let mut out = Vec::::new(); + let options = write::Options { + hash_kind: git_hash::Kind::Sha1, + version: Version::V2, + tree_cache_extension: false, + end_of_index_entry_extension: true, + }; + + expected.write_to(&mut out, options).unwrap(); + + let (generated, _) = State::from_bytes(&out, FileTime::now(), decode::Options::default()).unwrap(); + compare_states(&generated, &expected, options, fixture); + } +} + +fn compare_states(generated: &State, expected: &State, options: write::Options, fixture: &str) { + assert_eq!(generated.version(), options.version, "version mismatch in {}", fixture); + assert_eq!( + generated.tree(), + match options.tree_cache_extension { + true => expected.tree(), + false => None, + }, + "tree extension mismatch in {}", + fixture + ); + assert_eq!( + generated.entries().len(), + expected.entries().len(), + "entry count mismatch in {}", + fixture + ); + assert_eq!( + generated.entries(), + expected.entries(), + "entries mismatch in {}", + fixture + ); + assert_eq!( + generated.path_backing(), + expected.path_backing(), + "path_backing mismatch in {}", + fixture + ); +} + +fn compare_raw_bytes(generated: &[u8], expected: &[u8], fixture: &str) { + assert_eq!(generated.len(), expected.len(), "file length mismatch in {}", fixture); - let output = index.write_to(Options::default()); + let print_range = 10; + for (index, (a, b)) in generated.iter().zip(expected.iter()).enumerate() { + if a != b { + let range_left = max(index - print_range, 0); + let range_right = min(index + print_range, generated.len()); + let generated = &generated[range_left..range_right]; + let expected = &expected[range_left..range_right]; - assert_eq!(output, expected_without_hash); + panic! {"\n\nRoundtrip failed for index in fixture {:?} at position {:?}\n\ + \t Input: ... {:?} ...\n\ + \tExpected: ... {:?} ...\n\n\ + ", &fixture, index, generated, expected} + } + } }