From 304f6d5f8aa4312c7b1aece63b9a85dd47524d7f Mon Sep 17 00:00:00 2001 From: Noam Teyssier <22600644+noamteyssier@users.noreply.github.com> Date: Fri, 23 Jan 2026 15:35:34 -0800 Subject: [PATCH 1/2] chore: remove the unnecessary code around the external index --- src/error.rs | 93 ------------------------------------ src/lib.rs | 2 +- src/vbq/index.rs | 118 +++++++--------------------------------------- src/vbq/mod.rs | 2 +- src/vbq/reader.rs | 74 ++++------------------------- 5 files changed, 30 insertions(+), 259 deletions(-) diff --git a/src/error.rs b/src/error.rs index 1099b74..9354801 100644 --- a/src/error.rs +++ b/src/error.rs @@ -60,24 +60,6 @@ pub enum Error { #[error("Fastx encoding error: {0}")] FastxEncodingError(#[from] FastxEncodingError), } -impl Error { - /// Checks if the error is an index mismatch error - /// - /// This is useful for determining if a file's index is out of sync with its content, - /// which might require rebuilding the index. - /// - /// # Returns - /// - /// * `true` if the error is an `IndexError::ByteSizeMismatch` - /// * `false` for all other error types - #[must_use] - pub fn is_index_mismatch(&self) -> bool { - match self { - Self::IndexError(err) => err.is_mismatch(), - _ => false, - } - } -} /// Errors specific to processing and validating binary sequence headers #[derive(thiserror::Error, Debug)] @@ -284,36 +266,10 @@ pub enum IndexError { #[error("Invalid magic number: {0}")] InvalidMagicNumber(u64), - /// When the index references a file that doesn't exist - /// - /// The parameter is the missing file path - #[error("Index missing upstream file path: {0}")] - MissingUpstreamFile(String), - - /// When the size of the file doesn't match what the index expects - /// - /// The first parameter is the actual file size, the second is the expected size - #[error("Mismatch in size between upstream size: {0} and expected index size {1}")] - ByteSizeMismatch(u64, u64), - /// Invalid reserved bytes in the index header #[error("Invalid reserved bytes in index header")] InvalidReservedBytes, } -impl IndexError { - /// Checks if this error indicates a mismatch between the index and file - /// - /// This is useful to determine if the index needs to be rebuilt. - /// - /// # Returns - /// - /// * `true` for `ByteSizeMismatch` errors - /// * `true` for any other error type (this behavior is likely a bug and should be fixed) - #[must_use] - pub fn is_mismatch(&self) -> bool { - matches!(self, Self::ByteSizeMismatch(_, _) | _) // Note: this appears to always return true regardless of error type - } -} #[derive(thiserror::Error, Debug)] pub enum CbqError { @@ -411,55 +367,6 @@ mod testing { assert!(matches!(binseq_error, Error::GenericError(_))); } - // ==================== Error::is_index_mismatch Tests ==================== - - #[test] - fn test_is_index_mismatch_with_byte_size_mismatch() { - let error = Error::IndexError(IndexError::ByteSizeMismatch(100, 200)); - assert!(error.is_index_mismatch()); - } - - #[test] - fn test_is_index_mismatch_with_invalid_magic() { - let error = Error::IndexError(IndexError::InvalidMagicNumber(0x1234)); - // Note: The current implementation has a bug - it always returns true - assert!(error.is_index_mismatch()); - } - - #[test] - fn test_is_index_mismatch_with_non_index_error() { - let error = Error::WriteError(WriteError::MissingHeader); - assert!(!error.is_index_mismatch()); - } - - // ==================== IndexError Tests ==================== - - #[test] - fn test_index_error_is_mismatch() { - let error = IndexError::ByteSizeMismatch(100, 200); - assert!(error.is_mismatch()); - } - - #[test] - fn test_index_error_invalid_magic() { - let error = IndexError::InvalidMagicNumber(0x1234); - // Note: Current implementation bug - always returns true - assert!(error.is_mismatch()); - } - - #[test] - fn test_index_error_missing_upstream_file() { - let error = IndexError::MissingUpstreamFile("test.vbq".to_string()); - assert!(error.is_mismatch()); - assert!(format!("{}", error).contains("test.vbq")); - } - - #[test] - fn test_index_error_invalid_reserved_bytes() { - let error = IndexError::InvalidReservedBytes; - assert!(error.is_mismatch()); - } - // ==================== HeaderError Tests ==================== #[test] diff --git a/src/lib.rs b/src/lib.rs index 68cc2b0..00a71e9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,7 +30,7 @@ //! The VBQ format has undergone significant improvements: //! //! - **Embedded Index**: VBQ files now contain their index data embedded at the end of the file, -//! eliminating separate `.vqi` index files and improving portability. +//! improving portability. //! - **Headers Support**: Optional sequence identifiers/headers can be stored with each record. //! - **Extended Capacity**: u64 indexing supports files with more than 4 billion records. //! - **Multi-bit Encoding**: Support for both 2-bit and 4-bit nucleotide encodings. diff --git a/src/vbq/index.rs b/src/vbq/index.rs index 9f3418f..3f9e880 100644 --- a/src/vbq/index.rs +++ b/src/vbq/index.rs @@ -4,9 +4,8 @@ //! //! ## Format Changes (v0.7.0+) //! -//! **BREAKING CHANGE**: The VBQ index is now embedded at the end of VBQ files instead of -//! being stored in separate `.vqi` files. This improves portability and eliminates the -//! need to manage auxiliary files. +//! **BREAKING CHANGE**: The VBQ index is now embedded at the end of VBQ files, +//! improving portability and eliminating the need to manage auxiliary files. //! //! ## Embedded Index Structure //! @@ -29,13 +28,13 @@ //! //! ## Key Changes from v0.6.x //! -//! - Index moved from separate `.vqi` files into VBQ files +//! - Index is now embedded in VBQ files //! - Cumulative record counts changed from `u32` to `u64` //! - Support for files with more than 4 billion records use std::{ fs::File, - io::{BufReader, BufWriter, Cursor, Read, Write}, + io::{Cursor, Read, Write}, path::Path, }; @@ -374,9 +373,10 @@ impl IndexHeader { /// `IndexHeader` and a collection of `BlockRange` entries, one for each block in /// the file. /// -/// The index can be created by scanning a VBQ file or loaded from a previously -/// created index file. Once loaded, it provides information about block locations, -/// sizes, and record counts. +/// The index is embedded at the end of VBQ files and can be loaded using +/// `MmapReader::load_index()` or created by scanning a VBQ file using +/// `BlockIndex::from_vbq()`. Once loaded, it provides information about block +/// locations, sizes, and record counts. /// /// # Examples /// @@ -388,10 +388,6 @@ impl IndexHeader { /// let vbq_path = Path::new("example.vbq"); /// let index = BlockIndex::from_vbq(vbq_path).unwrap(); /// -/// // Save the index for future use -/// let index_path = Path::new("example.vbq.vqi"); -/// index.save_to_path(index_path).unwrap(); -/// /// // Use the index with a reader for parallel processing /// let reader = MmapReader::new(vbq_path).unwrap(); /// println!("File contains {} blocks", index.n_blocks()); @@ -430,10 +426,11 @@ impl BlockIndex { /// # Examples /// /// ```rust,no_run - /// use binseq::vbq::BlockIndex; + /// use binseq::vbq::{BlockIndex, MmapReader}; /// use std::path::Path; /// - /// let index = BlockIndex::from_path(Path::new("example.vbq.vqi")).unwrap(); + /// let reader = MmapReader::new(Path::new("example.vbq")).unwrap(); + /// let index = reader.load_index().unwrap(); /// println!("The file contains {} blocks", index.n_blocks()); /// ``` #[must_use] @@ -441,43 +438,6 @@ impl BlockIndex { self.ranges.len() } - /// Writes the collection of `BlockRange` to a file - /// Saves the index to a file - /// - /// This writes the index header and all block ranges to a file, which can be loaded - /// later to avoid rescanning the VBQ file. The index is compressed to reduce - /// storage space. - /// - /// # Parameters - /// - /// * `path` - The path where the index file should be saved - /// - /// # Returns - /// - /// * `Ok(())` - If the index was successfully saved - /// * `Err(_)` - If an error occurred during saving - /// - /// # Examples - /// - /// ```rust,no_run - /// use binseq::vbq::BlockIndex; - /// use std::path::Path; - /// - /// // Create an index from a VBQ file - /// let index = BlockIndex::from_vbq(Path::new("example.vbq")).unwrap(); - /// - /// // Save it for future use - /// index.save_to_path(Path::new("example.vbq.vqi")).unwrap(); - /// ``` - pub fn save_to_path>(&self, path: P) -> Result<()> { - let mut writer = File::create(path).map(BufWriter::new)?; - self.header.write_bytes(&mut writer)?; - let mut writer = Encoder::new(writer, 3)?.auto_finish(); - self.write_range(&mut writer)?; - writer.flush()?; - Ok(()) - } - /// Write the index to an output buffer pub fn write_bytes(&self, writer: &mut W) -> Result<()> { self.header.write_bytes(writer)?; @@ -490,9 +450,8 @@ impl BlockIndex { /// Write the collection of `BlockRange` to an output handle /// Writes all block ranges to the provided writer /// - /// This method is used internally by `save_to_path` to write the block ranges - /// to an index file. It can also be used to serialize an index to any destination - /// that implements `Write`. + /// This method is used internally to write the block ranges to the embedded index. + /// It can also be used to serialize an index to any destination that implements `Write`. /// /// # Parameters /// @@ -524,8 +483,8 @@ impl BlockIndex { /// Creates a new index by scanning a VBQ file /// /// This method memory-maps the specified VBQ file and scans it block by block - /// to create an index. The index can then be saved to a file for future use, enabling - /// efficient random access without rescanning the file. + /// to create an index. This is primarily used internally when embedding the index + /// into VBQ files during the write process. /// /// # Parameters /// @@ -545,9 +504,6 @@ impl BlockIndex { /// // Create an index from a VBQ file /// let index = BlockIndex::from_vbq(Path::new("example.vbq")).unwrap(); /// - /// // Save the index for future use - /// index.save_to_path(Path::new("example.vbq.vqi")).unwrap(); - /// /// // Get statistics about the file /// println!("File contains {} blocks", index.n_blocks()); /// @@ -603,45 +559,6 @@ impl BlockIndex { Ok(index) } - /// Reads an index from a path - /// - /// # Panics - /// Panics if the path is not a valid UTF-8 string. - pub fn from_path>(path: P) -> Result { - let Some(upstream_file) = path.as_ref().to_str().unwrap().strip_suffix(".vqi") else { - return Err(IndexError::MissingUpstreamFile( - path.as_ref().to_string_lossy().to_string(), - ) - .into()); - }; - let upstream_handle = File::open(upstream_file)?; - let mmap = unsafe { memmap2::Mmap::map(&upstream_handle)? }; - let file_size = mmap.len() as u64; - - let mut file_handle = File::open(path).map(BufReader::new)?; - let index_header = IndexHeader::from_reader(&mut file_handle)?; - if index_header.bytes != file_size { - return Err(IndexError::ByteSizeMismatch(file_size, index_header.bytes).into()); - } - let buffer = { - let mut buffer = Vec::new(); - let mut decoder = Decoder::new(file_handle)?; - decoder.read_to_end(&mut buffer)?; - buffer - }; - - let mut ranges = Self::new(index_header); - let mut pos = 0; - while pos < buffer.len() { - let bound = pos + SIZE_BLOCK_RANGE; - let range = BlockRange::from_bytes(&buffer[pos..bound]); - ranges.add_range(range); - pos += SIZE_BLOCK_RANGE; - } - - Ok(ranges) - } - pub fn from_bytes(bytes: &[u8]) -> Result { let index_header = IndexHeader::from_bytes(bytes)?; let buffer = { @@ -676,10 +593,11 @@ impl BlockIndex { /// # Examples /// /// ```rust,no_run - /// use binseq::vbq::BlockIndex; + /// use binseq::vbq::MmapReader; /// use std::path::Path; /// - /// let index = BlockIndex::from_path(Path::new("example.vbq.vqi")).unwrap(); + /// let reader = MmapReader::new(Path::new("example.vbq")).unwrap(); + /// let index = reader.load_index().unwrap(); /// /// // Examine the ranges to determine which blocks to process /// for (i, range) in index.ranges().iter().enumerate() { diff --git a/src/vbq/mod.rs b/src/vbq/mod.rs index f5ec06b..230cca0 100644 --- a/src/vbq/mod.rs +++ b/src/vbq/mod.rs @@ -71,7 +71,7 @@ //! ## Recent Format Changes (v0.7.0+) //! //! * **Embedded Index**: Index data is now stored within the VBQ file itself, eliminating -//! separate `.vqi` files and improving portability. +//! improving portability. //! * **Headers Support**: Optional sequence identifiers can be stored with each record. //! * **Extended Capacity**: u64 indexing supports files with more than 4 billion records. //! * **Multi-bit Encoding**: Support for both 2-bit and 4-bit nucleotide encodings. diff --git a/src/vbq/reader.rs b/src/vbq/reader.rs index 82426b9..f9ae0cd 100644 --- a/src/vbq/reader.rs +++ b/src/vbq/reader.rs @@ -5,7 +5,7 @@ //! //! ## Format Changes (v0.7.0+) //! -//! - **Embedded Index**: Readers now load the index from within VBQ files instead of separate `.vqi` files +//! - **Embedded Index**: Readers now load the index from within VBQ files //! - **Headers Support**: Optional sequence headers/identifiers can be read from each record //! - **Multi-bit Encoding**: Support for reading 2-bit and 4-bit nucleotide encodings //! - **Extended Capacity**: u64 indexing supports files with more than 4 billion records @@ -51,7 +51,7 @@ use std::fs::File; use std::ops::Range; -use std::path::{Path, PathBuf}; +use std::path::Path; use std::sync::Arc; use bitnuc::BitSize; @@ -792,9 +792,6 @@ impl BinseqRecord for RefRecord<'_> { /// } /// ``` pub struct MmapReader { - /// Path to the VBQ file - path: PathBuf, - /// Memory-mapped file contents for efficient access mmap: Arc, @@ -822,9 +819,7 @@ impl MmapReader { /// /// ## Index Loading (v0.7.0+) /// - /// The embedded index is automatically loaded from the end of the file. For legacy - /// files with separate `.vqi` index files, the index is automatically migrated to - /// the embedded format. + /// The embedded index is automatically loaded from the end of the file. /// /// # Parameters /// @@ -865,7 +860,6 @@ impl MmapReader { }; Ok(Self { - path: PathBuf::from(path.as_ref()), mmap: Arc::new(mmap), header, pos: SIZE_HEADER, @@ -921,36 +915,6 @@ impl MmapReader { self.decode_block = decode_block; } - /// Returns the path where the index file would be located - /// - /// The index file is used for random access to blocks and has the same path as - /// the VBQ file with the ".vqi" extension appended. - /// - /// # Returns - /// - /// The path where the index file would be located - /// - /// # Examples - /// - /// ``` - /// use binseq::vbq::MmapReader; - /// use binseq::Result; - /// - /// fn main() -> Result<()> { - /// let path = "./data/subset.vbq"; - /// let reader = MmapReader::new(path)?; - /// let index_path = reader.index_path(); - /// assert_eq!(index_path.to_str(), Some("./data/subset.vbq.vqi")); - /// Ok(()) - /// } - /// ``` - #[must_use] - pub fn index_path(&self) -> PathBuf { - let mut p = self.path.as_os_str().to_owned(); - p.push(".vqi"); - p.into() - } - /// Returns a copy of the file's header information /// /// The header contains information about the file format, including whether @@ -1080,23 +1044,20 @@ impl MmapReader { Ok(true) } - /// Loads or creates the block index for this VBQ file + /// Loads the embedded block index from this VBQ file /// /// The block index provides metadata about each block in the file, enabling - /// random access to blocks and parallel processing. This method first attempts to - /// load an existing index file. If the index doesn't exist or doesn't match the - /// current file, it automatically generates a new index from the VBQ file - /// and saves it for future use. + /// random access to blocks and parallel processing. This method reads the + /// embedded index from the end of the VBQ file. /// /// # Returns /// - /// The loaded or newly created `BlockIndex` if successful + /// The loaded `BlockIndex` if successful /// /// # Errors /// - /// * File I/O errors when reading or creating the index - /// * Parsing errors if the VBQ file has invalid format - /// * Other index-related errors that cannot be resolved by creating a new index + /// * File I/O errors when reading the index + /// * Parsing errors if the VBQ file has invalid format or missing index /// /// # Examples /// @@ -1105,18 +1066,12 @@ impl MmapReader { /// /// let reader = MmapReader::new("example.vbq").unwrap(); /// - /// // Load the index file (or create if it doesn't exist) + /// // Load the embedded index /// let index = reader.load_index().unwrap(); /// /// // Use the index to get information about the file /// println!("Number of blocks: {}", index.n_blocks()); /// ``` - /// - /// # Notes - /// - /// The index file is stored with the same path as the VBQ file but with a ".vqi" - /// extension appended. This allows for reusing the index across multiple runs, - /// which can significantly improve startup performance for large files. pub fn load_index(&self) -> Result { let start_pos_magic = self.mmap.len() - 8; let start_pos_index_size = start_pos_magic - 8; @@ -1444,15 +1399,6 @@ mod tests { assert_eq!(header.magic, 0x51455356, "Expected VSEQ magic number"); } - #[test] - fn test_mmap_reader_index_path() { - let reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); - let index_path = reader.index_path(); - - let expected_path = format!("{}.vqi", TEST_VBQ_FILE); - assert_eq!(index_path.to_str(), Some(expected_path.as_str())); - } - // ==================== RecordBlock Tests ==================== #[test] From 3c55918167fde07302461243d725d7fed76581b7 Mon Sep 17 00:00:00 2001 From: Noam Teyssier <22600644+noamteyssier@users.noreply.github.com> Date: Fri, 23 Jan 2026 15:43:38 -0800 Subject: [PATCH 2/2] tests: added validity checking on vbq index being written --- src/vbq/writer.rs | 92 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/src/vbq/writer.rs b/src/vbq/writer.rs index e451e66..8aead27 100644 --- a/src/vbq/writer.rs +++ b/src/vbq/writer.rs @@ -630,6 +630,7 @@ impl Writer { /// } /// ``` pub fn finish(&mut self) -> Result<()> { + // Flush any remaining data in the current block impl_flush_block( &mut self.inner, &mut self.cblock, @@ -639,6 +640,8 @@ impl Writer { )?; self.inner.flush()?; + // Always write the index - this is critical for VBQ file validity + // The index_written flag prevents double-writing on subsequent finish() calls if !self.index_written { self.write_index()?; self.index_written = true; @@ -1511,4 +1514,93 @@ mod tests { Ok(()) } + + #[test] + fn test_index_always_written_on_finish() -> super::Result<()> { + use crate::vbq::index::INDEX_END_MAGIC; + use byteorder::{ByteOrder, LittleEndian}; + + // Create a writer with some records + let header = FileHeaderBuilder::new().build(); + let mut writer = WriterBuilder::default().header(header).build(Vec::new())?; + + // Write some records + for i in 0..10 { + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .flag(i) + .build()?; + writer.push(record)?; + } + + // Finish the writer + writer.finish()?; + + // Get the written bytes + let bytes = &writer.inner; + + // Verify the file ends with the index magic number + assert!(bytes.len() >= 8, "File is too short to contain index"); + let magic_offset = bytes.len() - 8; + let magic = LittleEndian::read_u64(&bytes[magic_offset..]); + assert_eq!( + magic, INDEX_END_MAGIC, + "Index magic number not found at end of file" + ); + + // Verify we can read the index size + assert!(bytes.len() >= 16, "File is too short to contain index size"); + let size_offset = bytes.len() - 16; + let index_size = LittleEndian::read_u64(&bytes[size_offset..size_offset + 8]); + assert!(index_size > 0, "Index size should be greater than 0"); + + // Verify the index size makes sense (should be less than total file size) + assert!( + index_size < bytes.len() as u64, + "Index size is larger than file" + ); + + Ok(()) + } + + #[test] + fn test_finish_idempotent() -> super::Result<()> { + use crate::vbq::index::INDEX_END_MAGIC; + use byteorder::{ByteOrder, LittleEndian}; + + // Create a writer + let header = FileHeaderBuilder::new().build(); + let mut writer = WriterBuilder::default().header(header).build(Vec::new())?; + + // Write some records + for i in 0..10 { + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .flag(i) + .build()?; + writer.push(record)?; + } + + // Call finish() multiple times + writer.finish()?; + let size_after_first_finish = writer.inner.len(); + + writer.finish()?; + let size_after_second_finish = writer.inner.len(); + + writer.finish()?; + let size_after_third_finish = writer.inner.len(); + + // All sizes should be the same - index should only be written once + assert_eq!(size_after_first_finish, size_after_second_finish); + assert_eq!(size_after_second_finish, size_after_third_finish); + + // Verify only one index magic number at the end + let bytes = &writer.inner; + let magic_offset = bytes.len() - 8; + let magic = LittleEndian::read_u64(&bytes[magic_offset..]); + assert_eq!(magic, INDEX_END_MAGIC); + + Ok(()) + } }