Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/fix-uint8array-comparison.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@tanstack/db": patch
"@tanstack/db-ivm": patch
---

Fix Uint8Array/Buffer comparison to work by content instead of reference. This enables proper equality checks for binary IDs like ULIDs in WHERE clauses using the `eq` function.
47 changes: 36 additions & 11 deletions packages/db-ivm/src/hashing/hash.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ const OBJECT_MARKER = randomHash()
const ARRAY_MARKER = randomHash()
const MAP_MARKER = randomHash()
const SET_MARKER = randomHash()
const UINT8ARRAY_MARKER = randomHash()

// Maximum byte length for Uint8Arrays to hash by content instead of reference.
// Arrays of at most this many bytes are hashed by content, allowing proper equality comparisons
// for small arrays like ULIDs (16 bytes), while larger arrays are hashed by reference to avoid the performance cost.
const UINT8ARRAY_CONTENT_HASH_THRESHOLD = 128

const hashCache = new WeakMap<object, number>()

Expand All @@ -35,6 +41,24 @@ function hashObject(input: object): number {
let valueHash: number | undefined
if (input instanceof Date) {
valueHash = hashDate(input)
} else if (
// Check if input is a Uint8Array or Buffer
(typeof Buffer !== `undefined` && input instanceof Buffer) ||
input instanceof Uint8Array
) {
// For small Uint8Arrays/Buffers (e.g., ULIDs, UUIDs), hash by content
// to enable proper equality comparisons. For large arrays, hash by reference
// to avoid performance costs.
if (input.byteLength <= UINT8ARRAY_CONTENT_HASH_THRESHOLD) {
valueHash = hashUint8Array(input)
} else {
// Deeply hashing large arrays would be too costly
// so we track them by reference and cache them in a weak map
return cachedReferenceHash(input)
}
} else if (input instanceof File) {
// Files are always hashed by reference due to their potentially large size
return cachedReferenceHash(input)
} else {
let plainObjectInput = input
let marker = OBJECT_MARKER
Expand All @@ -53,17 +77,6 @@ function hashObject(input: object): number {
plainObjectInput = [...input.entries()]
}

if (
(typeof Buffer !== `undefined` && input instanceof Buffer) ||
input instanceof Uint8Array ||
input instanceof File
) {
// Deeply hashing these objects would be too costly
// but we also don't want to ignore them
// so we track them by reference and cache them in a weak map
return cachedReferenceHash(input)
}

valueHash = hashPlainObject(plainObjectInput, marker)
}

Expand All @@ -78,6 +91,18 @@ function hashDate(input: Date): number {
return hasher.digest()
}

function hashUint8Array(input: Uint8Array): number {
const hasher = new MurmurHashStream()
hasher.update(UINT8ARRAY_MARKER)
// Hash the byte length first to differentiate arrays of different sizes
hasher.update(input.byteLength)
// Hash each byte in the array
for (let i = 0; i < input.byteLength; i++) {
hasher.writeByte(input[i]!)
}
return hasher.digest()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can make this method public rather than use the comment to ignore the type error.

}

function hashPlainObject(input: object, marker: number): number {
const hasher = new MurmurHashStream()

Expand Down
30 changes: 15 additions & 15 deletions packages/db-ivm/src/hashing/murmur.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ export class MurmurHashStream implements Hasher {
this.hash = Math.imul(this.hash, 5) + 0xe6546b64
}

private _writeByte(byte: number): void {
writeByte(byte: number): void {
this.carry |= (byte & 0xff) << (8 * this.carryBytes)
this.carryBytes++
this.length++
Expand All @@ -74,29 +74,29 @@ export class MurmurHashStream implements Hasher {

for (let i = 0; i < description.length; i++) {
const code = description.charCodeAt(i)
this._writeByte(code & 0xff)
this._writeByte((code >>> 8) & 0xff)
this.writeByte(code & 0xff)
this.writeByte((code >>> 8) & 0xff)
}
return
}
case `string`:
this.update(STRING_MARKER)
for (let i = 0; i < chunk.length; i++) {
const code = chunk.charCodeAt(i)
this._writeByte(code & 0xff)
this._writeByte((code >>> 8) & 0xff)
this.writeByte(code & 0xff)
this.writeByte((code >>> 8) & 0xff)
}
return
case `number`:
dv.setFloat64(0, chunk, true) // fixed little-endian
this._writeByte(u8[0]!)
this._writeByte(u8[1]!)
this._writeByte(u8[2]!)
this._writeByte(u8[3]!)
this._writeByte(u8[4]!)
this._writeByte(u8[5]!)
this._writeByte(u8[6]!)
this._writeByte(u8[7]!)
this.writeByte(u8[0]!)
this.writeByte(u8[1]!)
this.writeByte(u8[2]!)
this.writeByte(u8[3]!)
this.writeByte(u8[4]!)
this.writeByte(u8[5]!)
this.writeByte(u8[6]!)
this.writeByte(u8[7]!)
return
case `bigint`: {
let value = chunk
Expand All @@ -107,10 +107,10 @@ export class MurmurHashStream implements Hasher {
this.update(BIG_INT_MARKER)
}
while (value > 0n) {
this._writeByte(Number(value & 0xffn))
this.writeByte(Number(value & 0xffn))
value >>= 8n
}
if (chunk === 0n) this._writeByte(0)
if (chunk === 0n) this.writeByte(0)
return
}
default:
Expand Down
43 changes: 40 additions & 3 deletions packages/db-ivm/tests/utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,8 @@ describe(`hash`, () => {
expect(hash4).not.toBe(hash6) // Different Symbol content should have different hash
})

it(`should hash Buffers, Uint8Arrays and File objects by reference`, () => {
it(`should hash small Buffers and Uint8Arrays by content`, () => {
// Small buffers (≤128 bytes) are hashed by content for proper equality comparisons
const buffer1 = Buffer.from([1, 2, 3])
const buffer2 = Buffer.from([1, 2, 3])
const buffer3 = Buffer.from([1, 2, 3, 4])
Expand All @@ -309,7 +310,7 @@ describe(`hash`, () => {
const hash3 = hash(buffer3)

expect(typeof hash1).toBe(hashType)
expect(hash1).not.toBe(hash2) // Same content but different buffer instances have a different hash because it would be too costly to deeply hash buffers
expect(hash1).toBe(hash2) // Same content = same hash for small buffers
expect(hash1).not.toBe(hash3) // Different Buffer content should have different hash
expect(hash1).toBe(hash(buffer1)) // Hashing same buffer should return same hash

Expand All @@ -322,10 +323,46 @@ describe(`hash`, () => {
const hash6 = hash(uint8Array3)

expect(typeof hash4).toBe(hashType)
expect(hash4).not.toBe(hash5) // Same content but different uint8Array instances have a different hash because it would be too costly to deeply hash uint8Arrays
expect(hash4).toBe(hash5) // Same content = same hash for small Uint8Arrays
expect(hash4).not.toBe(hash6) // Different uint8Array content should have different hash
expect(hash4).toBe(hash(uint8Array1)) // Hashing same uint8Array should return same hash
})

it(`should hash large Buffers, Uint8Arrays and File objects by reference`, () => {
// Large buffers (>128 bytes) are hashed by reference to avoid performance costs
const largeBuffer1 = Buffer.alloc(300)
const largeBuffer2 = Buffer.alloc(300)

// Fill with same content
for (let i = 0; i < 300; i++) {
largeBuffer1[i] = i % 256
largeBuffer2[i] = i % 256
}

const hash1 = hash(largeBuffer1)
const hash2 = hash(largeBuffer2)

expect(typeof hash1).toBe(hashType)
expect(hash1).not.toBe(hash2) // Same content but different instances = different hash for large buffers
expect(hash1).toBe(hash(largeBuffer1)) // Hashing same buffer should return same hash

const largeUint8Array1 = new Uint8Array(300)
const largeUint8Array2 = new Uint8Array(300)

// Fill with same content
for (let i = 0; i < 300; i++) {
largeUint8Array1[i] = i % 256
largeUint8Array2[i] = i % 256
}

const hash3 = hash(largeUint8Array1)
const hash4 = hash(largeUint8Array2)

expect(typeof hash3).toBe(hashType)
expect(hash3).not.toBe(hash4) // Same content but different instances = different hash for large Uint8Arrays
expect(hash3).toBe(hash(largeUint8Array1)) // Hashing same uint8Array should return same hash

// Files are always hashed by reference regardless of size
const file1 = new File([`Hello, world!`], `test.txt`)
const file2 = new File([`Hello, world!`], `test.txt`)
const file3 = new File([`Hello, world!`], `test.txt`)
Expand Down
5 changes: 3 additions & 2 deletions packages/db/src/query/compiler/evaluators.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import {
UnknownExpressionTypeError,
UnknownFunctionError,
} from "../../errors.js"
import { normalizeValue } from "../../utils/comparison.js"
import { areValuesEqual, normalizeValue } from "../../utils/comparison.js"
import type { BasicExpression, Func, PropRef } from "../ir.js"
import type { NamespacedRow } from "../../types.js"

Expand Down Expand Up @@ -172,7 +172,8 @@ function compileFunction(func: Func, isSingleRow: boolean): (data: any) => any {
if (isUnknown(a) || isUnknown(b)) {
return null
}
return a === b
// Use areValuesEqual for proper Uint8Array/Buffer comparison
return areValuesEqual(a, b)
}
}
case `gt`: {
Expand Down
71 changes: 70 additions & 1 deletion packages/db/src/utils/comparison.ts
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,80 @@ export const defaultComparator = makeComparator({
})

/**
 * Checks whether two byte arrays hold exactly the same bytes.
 *
 * @param a - First byte array.
 * @param b - Second byte array.
 * @returns `true` when both arrays have the same byte length and the bytes at
 * every index match; `false` otherwise.
 */
function areUint8ArraysEqual(a: Uint8Array, b: Uint8Array): boolean {
  const size = a.byteLength

  // Arrays of different sizes can never match, so skip the byte scan entirely
  if (size !== b.byteLength) return false

  let index = 0
  while (index < size) {
    // Bail out on the first mismatching byte
    if (a[index] !== b[index]) return false
    index++
  }

  return true
}

/**
 * Largest byte length at which a Uint8Array/Buffer is normalized into a string.
 * Anything bigger is returned untouched (and is therefore compared by
 * reference) so that large binary payloads do not cause big string allocations.
 * 128 bytes comfortably covers common binary ID formats such as 16-byte ULIDs
 * and UUIDs.
 */
const UINT8ARRAY_NORMALIZE_THRESHOLD = 128

/**
 * Turns a value into a form that can be compared directly or used as a Map key.
 *
 * - A `Date` becomes its epoch-millisecond timestamp.
 * - A Uint8Array/Buffer of at most `UINT8ARRAY_NORMALIZE_THRESHOLD` bytes
 *   becomes a `__u8__`-prefixed string listing its bytes, which gives
 *   content-based equality for binary data such as ULIDs.
 * - Everything else, including larger byte arrays, is returned as is.
 *
 * @param value - Value to normalize.
 * @returns The normalized representation, or `value` itself when no rule applies.
 */
export function normalizeValue(value: any): any {
  if (value instanceof Date) {
    return value.getTime()
  }

  // Buffer only exists in some runtimes, so guard the global before using it
  const isByteArray =
    value instanceof Uint8Array ||
    (typeof Buffer !== `undefined` && value instanceof Buffer)

  if (isByteArray && value.byteLength <= UINT8ARRAY_NORMALIZE_THRESHOLD) {
    // The prefix keeps the generated key apart from ordinary user strings
    const bytes = Array.from(value)
    return `__u8__${bytes.join(`,`)}`
  }

  // Large byte arrays fall through here and keep reference equality;
  // callers handling large binary data should derive their own key if needed
  return value
}

/**
 * Equality check that treats Uint8Arrays and Buffers as equal when their
 * contents match, and uses strict (`===`) equality for everything else.
 *
 * @param a - Left-hand value.
 * @param b - Right-hand value.
 * @returns `true` if the values are strictly equal, or if both are byte arrays
 * with identical contents; `false` otherwise.
 */
export function areValuesEqual(a: any, b: any): boolean {
  // Identical references and equal primitives need no further inspection
  if (a === b) {
    return true
  }

  // Buffer only exists in some runtimes, so guard the global before using it
  const bufferAvailable = typeof Buffer !== `undefined`
  const isByteArray = (candidate: any): boolean =>
    candidate instanceof Uint8Array ||
    (bufferAvailable && candidate instanceof Buffer)

  // Anything that is not a pair of byte arrays has already failed `===`
  if (!isByteArray(a) || !isByteArray(b)) {
    return false
  }

  // Two distinct byte arrays: equal only when their contents match
  return areUint8ArraysEqual(a, b)
}
Loading
Loading