Hash small Uint8Arrays (≤128 bytes) by content rather than reference (#779)

KyleAMathews · claude · web-flow · commit 7aedf12996a6 · 2025-11-10T11:17:46.000-07:00
* fix: compare Uint8Arrays by content for proper binary ID equality

Fixes `eq` function and hash indexing to compare Uint8Arrays/Buffers by content
instead of reference, enabling proper ULID comparisons in WHERE clauses.

Changes:
- Hash small Uint8Arrays (≤128 bytes) by content in db-ivm for better indexing
- Compare Uint8Arrays by content in eq operator via areValuesEqual() function
- Add comprehensive tests for Uint8Array equality comparison

* test: add tests for Uint8Array equality with zero-filled arrays

Add tests that specifically cover the user's reproduction case where
Uint8Arrays are created with a length (e.g., new Uint8Array(5)) resulting
in zero-filled arrays. Confirms that content comparison works correctly.

* test: add tests for primitive equality to verify no regression

Add explicit tests for string and number equality to ensure that the
areValuesEqual function doesn't break primitive comparisons. All tests
pass, confirming the implementation correctly handles both Uint8Arrays
and primitives.

* fix: compare Uint8Arrays by content for proper binary ID equality

Fixes the `eq` function and hash indexing to compare Uint8Arrays/Buffers by content
instead of reference, enabling proper ULID comparisons in WHERE clauses.

The issue was that `eq` used `a === b`, which compares Uint8Arrays by reference.
Now it uses `areValuesEqual()`, which compares Uint8Arrays byte-by-byte.

Changes:
- Hash small Uint8Arrays (≤128 bytes) by content in db-ivm for better indexing
- Compare Uint8Arrays by content in eq operator via areValuesEqual() function
- Made writeByte() public in MurmurHashStream
- Add comprehensive tests for Uint8Array equality comparison
- Add integration test reproducing the user's exact scenario

All tests pass (84/84 evaluator tests, 1/1 integration test).

* fix: normalize Uint8Arrays for Map key usage in indexes

The previous fix handled Uint8Array comparison at the expression
evaluation level, but index lookups still failed because JavaScript
Maps use reference equality for object keys.

Updated normalizeValue() to convert Uint8Arrays/Buffers to string
representations that can be used as Map keys with content-based
equality. This enables proper index lookups for binary IDs like
ULIDs when auto-indexing is enabled (the default behavior).

Also updated the integration test to verify the fix works with
auto-indexing enabled.

* fix: add 128-byte threshold to prevent large Uint8Array string duplication

Applied the same 128-byte threshold to normalizeValue() as used in
the hashing function. This prevents creating giant strings in memory
when indexing large Uint8Arrays (> 128 bytes).

Arrays larger than 128 bytes will fall back to reference equality,
which is acceptable as the fix is primarily for ID use cases (ULIDs
and UUIDs are both 16 bytes).

Added test coverage to verify the threshold behavior works as expected.

---------

Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/.changeset/fix-uint8array-comparison.md b/.changeset/fix-uint8array-comparison.md
@@ -0,0 +1,6 @@
+---
+"@tanstack/db": patch
+"@tanstack/db-ivm": patch
+---
+
+Fix Uint8Array/Buffer comparison to work by content instead of reference. This enables proper equality checks for binary IDs like ULIDs in WHERE clauses using the `eq` function.
diff --git a/packages/db-ivm/src/hashing/hash.ts b/packages/db-ivm/src/hashing/hash.ts
@@ -17,6 +17,12 @@ const OBJECT_MARKER = randomHash()
 const ARRAY_MARKER = randomHash()
 const MAP_MARKER = randomHash()
 const SET_MARKER = randomHash()
+const UINT8ARRAY_MARKER = randomHash()
+
+// Maximum byte length for Uint8Arrays to hash by content instead of reference
+// Arrays smaller than this will be hashed by content, allowing proper equality comparisons
+// for small arrays like ULIDs (16 bytes) while still avoiding performance costs for large arrays
+const UINT8ARRAY_CONTENT_HASH_THRESHOLD = 128
 
 const hashCache = new WeakMap<object, number>()
 
@@ -35,6 +41,24 @@ function hashObject(input: object): number {
   let valueHash: number | undefined
   if (input instanceof Date) {
     valueHash = hashDate(input)
+  } else if (
+    // Check if input is a Uint8Array or Buffer
+    (typeof Buffer !== `undefined` && input instanceof Buffer) ||
+    input instanceof Uint8Array
+  ) {
+    // For small Uint8Arrays/Buffers (e.g., ULIDs, UUIDs), hash by content
+    // to enable proper equality comparisons. For large arrays, hash by reference
+    // to avoid performance costs.
+    if (input.byteLength <= UINT8ARRAY_CONTENT_HASH_THRESHOLD) {
+      valueHash = hashUint8Array(input)
+    } else {
+      // Deeply hashing large arrays would be too costly
+      // so we track them by reference and cache them in a weak map
+      return cachedReferenceHash(input)
+    }
+  } else if (input instanceof File) {
+    // Files are always hashed by reference due to their potentially large size
+    return cachedReferenceHash(input)
   } else {
     let plainObjectInput = input
     let marker = OBJECT_MARKER
@@ -53,17 +77,6 @@ function hashObject(input: object): number {
       plainObjectInput = [...input.entries()]
     }
 
-    if (
-      (typeof Buffer !== `undefined` && input instanceof Buffer) ||
-      input instanceof Uint8Array ||
-      input instanceof File
-    ) {
-      // Deeply hashing these objects would be too costly
-      // but we also don't want to ignore them
-      // so we track them by reference and cache them in a weak map
-      return cachedReferenceHash(input)
-    }
-
     valueHash = hashPlainObject(plainObjectInput, marker)
   }
 
@@ -78,6 +91,18 @@ function hashDate(input: Date): number {
   return hasher.digest()
 }
 
+function hashUint8Array(input: Uint8Array): number {
+  const hasher = new MurmurHashStream()
+  hasher.update(UINT8ARRAY_MARKER)
+  // Hash the byte length first to differentiate arrays of different sizes
+  hasher.update(input.byteLength)
+  // Hash each byte in the array
+  for (let i = 0; i < input.byteLength; i++) {
+    hasher.writeByte(input[i]!)
+  }
+  return hasher.digest()
+}
+
 function hashPlainObject(input: object, marker: number): number {
   const hasher = new MurmurHashStream()
 
diff --git a/packages/db-ivm/src/hashing/murmur.ts b/packages/db-ivm/src/hashing/murmur.ts
@@ -51,7 +51,7 @@ export class MurmurHashStream implements Hasher {
     this.hash = Math.imul(this.hash, 5) + 0xe6546b64
   }
 
-  private _writeByte(byte: number): void {
+  writeByte(byte: number): void {
     this.carry |= (byte & 0xff) << (8 * this.carryBytes)
     this.carryBytes++
     this.length++
@@ -74,29 +74,29 @@ export class MurmurHashStream implements Hasher {
 
         for (let i = 0; i < description.length; i++) {
           const code = description.charCodeAt(i)
-          this._writeByte(code & 0xff)
-          this._writeByte((code >>> 8) & 0xff)
+          this.writeByte(code & 0xff)
+          this.writeByte((code >>> 8) & 0xff)
         }
         return
       }
       case `string`:
         this.update(STRING_MARKER)
         for (let i = 0; i < chunk.length; i++) {
           const code = chunk.charCodeAt(i)
-          this._writeByte(code & 0xff)
-          this._writeByte((code >>> 8) & 0xff)
+          this.writeByte(code & 0xff)
+          this.writeByte((code >>> 8) & 0xff)
         }
         return
       case `number`:
         dv.setFloat64(0, chunk, true) // fixed little-endian
-        this._writeByte(u8[0]!)
-        this._writeByte(u8[1]!)
-        this._writeByte(u8[2]!)
-        this._writeByte(u8[3]!)
-        this._writeByte(u8[4]!)
-        this._writeByte(u8[5]!)
-        this._writeByte(u8[6]!)
-        this._writeByte(u8[7]!)
+        this.writeByte(u8[0]!)
+        this.writeByte(u8[1]!)
+        this.writeByte(u8[2]!)
+        this.writeByte(u8[3]!)
+        this.writeByte(u8[4]!)
+        this.writeByte(u8[5]!)
+        this.writeByte(u8[6]!)
+        this.writeByte(u8[7]!)
         return
       case `bigint`: {
         let value = chunk
@@ -107,10 +107,10 @@ export class MurmurHashStream implements Hasher {
           this.update(BIG_INT_MARKER)
         }
         while (value > 0n) {
-          this._writeByte(Number(value & 0xffn))
+          this.writeByte(Number(value & 0xffn))
           value >>= 8n
         }
-        if (chunk === 0n) this._writeByte(0)
+        if (chunk === 0n) this.writeByte(0)
         return
       }
       default:
diff --git a/packages/db-ivm/tests/utils.test.ts b/packages/db-ivm/tests/utils.test.ts
@@ -299,7 +299,8 @@ describe(`hash`, () => {
       expect(hash4).not.toBe(hash6) // Different Symbol content should have different hash
     })
 
-    it(`should hash Buffers, Uint8Arrays and File objects by reference`, () => {
+    it(`should hash small Buffers and Uint8Arrays by content`, () => {
+      // Small buffers (≤128 bytes) are hashed by content for proper equality comparisons
       const buffer1 = Buffer.from([1, 2, 3])
       const buffer2 = Buffer.from([1, 2, 3])
       const buffer3 = Buffer.from([1, 2, 3, 4])
@@ -309,7 +310,7 @@ describe(`hash`, () => {
       const hash3 = hash(buffer3)
 
       expect(typeof hash1).toBe(hashType)
-      expect(hash1).not.toBe(hash2) // Same content but different buffer instances have a different hash because it would be too costly to deeply hash buffers
+      expect(hash1).toBe(hash2) // Same content = same hash for small buffers
       expect(hash1).not.toBe(hash3) // Different Buffer content should have different hash
       expect(hash1).toBe(hash(buffer1)) // Hashing same buffer should return same hash
 
@@ -322,10 +323,46 @@ describe(`hash`, () => {
       const hash6 = hash(uint8Array3)
 
       expect(typeof hash4).toBe(hashType)
-      expect(hash4).not.toBe(hash5) // Same content but different uint8Array instances have a different hash because it would be too costly to deeply hash uint8Arrays
+      expect(hash4).toBe(hash5) // Same content = same hash for small Uint8Arrays
       expect(hash4).not.toBe(hash6) // Different uint8Array content should have different hash
       expect(hash4).toBe(hash(uint8Array1)) // Hashing same uint8Array should return same hash
+    })
+
+    it(`should hash large Buffers, Uint8Arrays and File objects by reference`, () => {
+      // Large buffers (>128 bytes) are hashed by reference to avoid performance costs
+      const largeBuffer1 = Buffer.alloc(300)
+      const largeBuffer2 = Buffer.alloc(300)
+
+      // Fill with same content
+      for (let i = 0; i < 300; i++) {
+        largeBuffer1[i] = i % 256
+        largeBuffer2[i] = i % 256
+      }
+
+      const hash1 = hash(largeBuffer1)
+      const hash2 = hash(largeBuffer2)
+
+      expect(typeof hash1).toBe(hashType)
+      expect(hash1).not.toBe(hash2) // Same content but different instances = different hash for large buffers
+      expect(hash1).toBe(hash(largeBuffer1)) // Hashing same buffer should return same hash
+
+      const largeUint8Array1 = new Uint8Array(300)
+      const largeUint8Array2 = new Uint8Array(300)
+
+      // Fill with same content
+      for (let i = 0; i < 300; i++) {
+        largeUint8Array1[i] = i % 256
+        largeUint8Array2[i] = i % 256
+      }
+
+      const hash3 = hash(largeUint8Array1)
+      const hash4 = hash(largeUint8Array2)
+
+      expect(typeof hash3).toBe(hashType)
+      expect(hash3).not.toBe(hash4) // Same content but different instances = different hash for large Uint8Arrays
+      expect(hash3).toBe(hash(largeUint8Array1)) // Hashing same uint8Array should return same hash
 
+      // Files are always hashed by reference regardless of size
       const file1 = new File([`Hello, world!`], `test.txt`)
       const file2 = new File([`Hello, world!`], `test.txt`)
       const file3 = new File([`Hello, world!`], `test.txt`)
diff --git a/packages/db/src/query/compiler/evaluators.ts b/packages/db/src/query/compiler/evaluators.ts
@@ -3,7 +3,7 @@ import {
   UnknownExpressionTypeError,
   UnknownFunctionError,
 } from "../../errors.js"
-import { normalizeValue } from "../../utils/comparison.js"
+import { areValuesEqual, normalizeValue } from "../../utils/comparison.js"
 import type { BasicExpression, Func, PropRef } from "../ir.js"
 import type { NamespacedRow } from "../../types.js"
 
@@ -172,7 +172,8 @@ function compileFunction(func: Func, isSingleRow: boolean): (data: any) => any {
         if (isUnknown(a) || isUnknown(b)) {
           return null
         }
-        return a === b
+        // Use areValuesEqual for proper Uint8Array/Buffer comparison
+        return areValuesEqual(a, b)
       }
     }
     case `gt`: {
diff --git a/packages/db/src/utils/comparison.ts b/packages/db/src/utils/comparison.ts
@@ -112,11 +112,80 @@ export const defaultComparator = makeComparator({
 })
 
 /**
- * Normalize a value for comparison
+ * Compare two Uint8Arrays for content equality
+ */
+function areUint8ArraysEqual(a: Uint8Array, b: Uint8Array): boolean {
+  if (a.byteLength !== b.byteLength) {
+    return false
+  }
+  for (let i = 0; i < a.byteLength; i++) {
+    if (a[i] !== b[i]) {
+      return false
+    }
+  }
+  return true
+}
+
+/**
+ * Threshold for normalizing Uint8Arrays to string representations.
+ * Arrays larger than this will use reference equality to avoid memory overhead.
+ * 128 bytes is enough for common ID formats (ULIDs are 16 bytes, UUIDs are 16 bytes)
+ * while avoiding excessive string allocation for large binary data.
+ */
+const UINT8ARRAY_NORMALIZE_THRESHOLD = 128
+
+/**
+ * Normalize a value for comparison and Map key usage
+ * Converts values that can't be directly compared or used as Map keys
+ * into comparable primitive representations
  */
 export function normalizeValue(value: any): any {
   if (value instanceof Date) {
     return value.getTime()
   }
+
+  // Normalize Uint8Arrays/Buffers to a string representation for Map key usage
+  // This enables content-based equality for binary data like ULIDs
+  const isUint8Array =
+    (typeof Buffer !== `undefined` && value instanceof Buffer) ||
+    value instanceof Uint8Array
+
+  if (isUint8Array) {
+    // Only normalize small arrays to avoid memory overhead for large binary data
+    if (value.byteLength <= UINT8ARRAY_NORMALIZE_THRESHOLD) {
+      // Convert to a string representation that can be used as a Map key
+      // Use a special prefix to avoid collisions with user strings
+      return `__u8__${Array.from(value).join(`,`)}`
+    }
+    // For large arrays, fall back to reference equality
+    // Users working with large binary data should use a derived key if needed
+  }
+
   return value
 }
+
+/**
+ * Compare two values for equality, with special handling for Uint8Arrays and Buffers
+ */
+export function areValuesEqual(a: any, b: any): boolean {
+  // Fast path for reference equality
+  if (a === b) {
+    return true
+  }
+
+  // Check for Uint8Array/Buffer comparison
+  const aIsUint8Array =
+    (typeof Buffer !== `undefined` && a instanceof Buffer) ||
+    a instanceof Uint8Array
+  const bIsUint8Array =
+    (typeof Buffer !== `undefined` && b instanceof Buffer) ||
+    b instanceof Uint8Array
+
+  // If both are Uint8Arrays, compare by content
+  if (aIsUint8Array && bIsUint8Array) {
+    return areUint8ArraysEqual(a, b)
+  }
+
+  // Different types or not Uint8Arrays
+  return false
+}
diff --git a/packages/db/tests/integration/uint8array-id-comparison.test.ts b/packages/db/tests/integration/uint8array-id-comparison.test.ts
diff --git a/packages/db/tests/query/compiler/evaluators.test.ts b/packages/db/tests/query/compiler/evaluators.test.ts

Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,7 @@ import {`
`3`	`3`	`UnknownExpressionTypeError,`
`4`	`4`	`UnknownFunctionError,`
`5`	`5`	`} from "../../errors.js"`
`6`		`-import { normalizeValue } from "../../utils/comparison.js"`
	`6`	`+import { areValuesEqual, normalizeValue } from "../../utils/comparison.js"`
`7`	`7`	`import type { BasicExpression, Func, PropRef } from "../ir.js"`
`8`	`8`	`import type { NamespacedRow } from "../../types.js"`
`9`	`9`
`@@ -172,7 +172,8 @@ function compileFunction(func: Func, isSingleRow: boolean): (data: any) => any {`
`172`	`172`	`if (isUnknown(a) \|\| isUnknown(b)) {`
`173`	`173`	`return null`
`174`	`174`	`}`
`175`		`- return a === b`
	`175`	`+ // Use areValuesEqual for proper Uint8Array/Buffer comparison`
	`176`	`+ return areValuesEqual(a, b)`
`176`	`177`	`}`
`177`	`178`	`}`
`178`	`179`	case `gt`: {