Skip to content

Commit 7ace2ab

Browse files
committed
fix
1 parent 4401379 commit 7ace2ab

File tree

6 files changed

+235
-216
lines changed

6 files changed

+235
-216
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.paimon.vector;
20+
21+
import io.github.jbellis.jvector.graph.OnHeapGraphIndex;
22+
23+
import java.util.List;
24+
25+
/** A shard of the vector index containing row IDs, vectors, metadata, and the graph index. */
26+
public class IndexShard {
27+
final long[] rowIds;
28+
final List<float[]> vectors;
29+
final VectorIndexMetadata metadata;
30+
final long rowRangeStart;
31+
final OnHeapGraphIndex graphIndex;
32+
33+
IndexShard(
34+
long[] rowIds,
35+
List<float[]> vectors,
36+
VectorIndexMetadata metadata,
37+
long rowRangeStart,
38+
OnHeapGraphIndex graphIndex) {
39+
this.rowIds = rowIds;
40+
this.vectors = vectors;
41+
this.metadata = metadata;
42+
this.rowRangeStart = rowRangeStart;
43+
this.graphIndex = graphIndex;
44+
}
45+
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.paimon.vector;
20+
21+
import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
22+
import io.github.jbellis.jvector.vector.types.VectorFloat;
23+
24+
import java.util.List;
25+
26+
/** Implementation of RandomAccessVectorValues for Paimon vectors. */
27+
public class PaimonRandomAccessVectorValues implements RandomAccessVectorValues {
28+
private final int size;
29+
private final int dimension;
30+
private final List<VectorFloat<?>> vectors;
31+
private final boolean isValueShared;
32+
33+
public PaimonRandomAccessVectorValues(
34+
int size, int dimension, List<VectorFloat<?>> vectors, boolean isValueShared) {
35+
this.size = size;
36+
this.dimension = dimension;
37+
this.vectors = vectors;
38+
this.isValueShared = isValueShared;
39+
}
40+
41+
@Override
42+
public int size() {
43+
return this.size;
44+
}
45+
46+
@Override
47+
public int dimension() {
48+
return this.dimension;
49+
}
50+
51+
@Override
52+
public VectorFloat<?> getVector(int i) {
53+
return this.vectors.get(i);
54+
}
55+
56+
@Override
57+
public boolean isValueShared() {
58+
return this.isValueShared;
59+
}
60+
61+
@Override
62+
public RandomAccessVectorValues copy() {
63+
return this;
64+
}
65+
}

paimon-vector/src/main/java/org/apache/paimon/vector/VectorGlobalIndexReader.java

Lines changed: 29 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929

3030
import io.github.jbellis.jvector.graph.GraphIndexBuilder;
3131
import io.github.jbellis.jvector.graph.GraphSearcher;
32-
import io.github.jbellis.jvector.graph.OnHeapGraphIndex;
3332
import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
3433
import io.github.jbellis.jvector.graph.SearchResult;
3534
import io.github.jbellis.jvector.util.Bits;
@@ -122,16 +121,9 @@ private IndexShard deserializeIndex(byte[] indexBytes, GlobalIndexIOMeta meta)
122121

123122
// Parse metadata
124123
byte[] metaBytes = meta.metadata();
125-
VectorIndexMetadata metadata = deserializeMetadata(metaBytes);
124+
VectorIndexMetadata metadata = VectorIndexMetadata.deserializeMetadata(metaBytes);
126125

127-
// Rebuild graph index if version 2 (with JVector)
128-
OnHeapGraphIndex graphIndex = null;
129-
if (version == 2 && dataIn.available() > 0) {
130-
// Read graph structure
131-
int graphSize = dataIn.readInt();
132-
int maxDegree = dataIn.readInt();
133-
134-
// Rebuild the graph index from vectors
126+
if (dataIn.available() > 0) {
135127
var vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport();
136128
List<VectorFloat<?>> vectorFloats = new ArrayList<>();
137129
for (float[] vec : vectors) {
@@ -140,64 +132,29 @@ private IndexShard deserializeIndex(byte[] indexBytes, GlobalIndexIOMeta meta)
140132

141133
// Create RandomAccessVectorValues
142134
RandomAccessVectorValues vectorValues =
143-
new RandomAccessVectorValues() {
144-
@Override
145-
public int size() {
146-
return vectorFloats.size();
147-
}
148-
149-
@Override
150-
public int dimension() {
151-
return metadata.dimension;
152-
}
153-
154-
@Override
155-
public VectorFloat<?> getVector(int i) {
156-
return vectorFloats.get(i);
157-
}
158-
159-
@Override
160-
public boolean isValueShared() {
161-
return false;
162-
}
163-
164-
@Override
165-
public RandomAccessVectorValues copy() {
166-
return this;
167-
}
168-
};
135+
new PaimonRandomAccessVectorValues(
136+
vectorFloats.size(), metadata.dimension(), vectorFloats, false);
169137

170138
// Rebuild graph using stored structure or rebuild from scratch
171139
VectorSimilarityFunction similarityFunction =
172-
VectorSimilarityFunction.valueOf(metadata.similarityFunction);
173-
var builder =
140+
VectorSimilarityFunction.valueOf(metadata.similarityFunction());
141+
try (GraphIndexBuilder builder =
174142
new GraphIndexBuilder(
175143
vectorValues,
176144
similarityFunction,
177-
metadata.m,
178-
metadata.efConstruction,
145+
metadata.m(),
146+
metadata.efConstruction(),
179147
1.0f, // todo: need conf
180-
1.0f); // todo: need conf
181-
graphIndex = builder.build(vectorValues);
148+
1.0f)) { // todo: need conf
149+
return new IndexShard(
150+
rowIds,
151+
vectors,
152+
metadata,
153+
meta.rowIdRange().from,
154+
builder.build(vectorValues));
155+
}
182156
}
183-
184-
return new IndexShard(rowIds, vectors, metadata, meta.rowIdRange().from, graphIndex);
185-
}
186-
}
187-
188-
private VectorIndexMetadata deserializeMetadata(byte[] metaBytes) throws IOException {
189-
if (metaBytes == null || metaBytes.length == 0) {
190-
return new VectorIndexMetadata(
191-
this.dimension, this.similarityFunction, this.m, this.efConstruction);
192-
}
193-
194-
ByteArrayInputStream byteIn = new ByteArrayInputStream(metaBytes);
195-
try (DataInputStream dataIn = new DataInputStream(byteIn)) {
196-
int dimension = dataIn.readInt();
197-
String metricName = dataIn.readUTF();
198-
int m = dataIn.readInt();
199-
int efConstruction = dataIn.readInt();
200-
return new VectorIndexMetadata(dimension, metricName, m, efConstruction);
157+
return new IndexShard(rowIds, vectors, metadata, meta.rowIdRange().from, null);
201158
}
202159
}
203160

@@ -222,43 +179,21 @@ public GlobalIndexResult search(float[] query, int k) {
222179

223180
// Use JVector's GraphSearcher for efficient ANN search
224181
VectorSimilarityFunction similarityFunction =
225-
VectorSimilarityFunction.valueOf(shard.metadata.similarityFunction);
182+
VectorSimilarityFunction.valueOf(shard.metadata.similarityFunction());
226183

227184
// Create vector values for search context
228-
var vectorTypeSupport2 = VectorizationProvider.getInstance().getVectorTypeSupport();
185+
var vectorTypeSupportForIndex =
186+
VectorizationProvider.getInstance().getVectorTypeSupport();
229187
List<VectorFloat<?>> vectorFloats = new ArrayList<>();
230188
for (float[] vec : shard.vectors) {
231-
vectorFloats.add(vectorTypeSupport2.createFloatVector(vec));
189+
vectorFloats.add(vectorTypeSupportForIndex.createFloatVector(vec));
232190
}
233-
234191
RandomAccessVectorValues vectorValues =
235-
new RandomAccessVectorValues() {
236-
@Override
237-
public int size() {
238-
return vectorFloats.size();
239-
}
240-
241-
@Override
242-
public int dimension() {
243-
return shard.metadata.dimension;
244-
}
245-
246-
@Override
247-
public VectorFloat<?> getVector(int i) {
248-
return vectorFloats.get(i);
249-
}
250-
251-
@Override
252-
public boolean isValueShared() {
253-
return false;
254-
}
255-
256-
@Override
257-
public RandomAccessVectorValues copy() {
258-
return this;
259-
}
260-
};
261-
192+
new PaimonRandomAccessVectorValues(
193+
vectorFloats.size(),
194+
shard.metadata.dimension(),
195+
vectorFloats,
196+
false);
262197
// Search using static method
263198
SearchResult result =
264199
GraphSearcher.search(
@@ -272,7 +207,9 @@ public RandomAccessVectorValues copy() {
272207
// Collect row IDs from results
273208
var nodes = result.getNodes();
274209
for (int i = 0; i < nodes.length && i < k; i++) {
275-
int nodeId = nodes[i].node;
210+
// todo: could get score here: nodeScore.score
211+
SearchResult.NodeScore nodeScore = nodes[i];
212+
int nodeId = nodeScore.node;
276213
if (nodeId >= 0 && nodeId < shard.rowIds.length) {
277214
resultIds.add(shard.rowIds[nodeId]);
278215
}
@@ -360,39 +297,4 @@ public Optional<GlobalIndexResult> visitIn(FieldRef fieldRef, List<Object> liter
360297
public Optional<GlobalIndexResult> visitNotIn(FieldRef fieldRef, List<Object> literals) {
361298
throw new UnsupportedOperationException("Vector index does not support notIn predicate");
362299
}
363-
364-
private static class IndexShard {
365-
final long[] rowIds;
366-
final List<float[]> vectors;
367-
final VectorIndexMetadata metadata;
368-
final long rowRangeStart;
369-
final OnHeapGraphIndex graphIndex;
370-
371-
IndexShard(
372-
long[] rowIds,
373-
List<float[]> vectors,
374-
VectorIndexMetadata metadata,
375-
long rowRangeStart,
376-
OnHeapGraphIndex graphIndex) {
377-
this.rowIds = rowIds;
378-
this.vectors = vectors;
379-
this.metadata = metadata;
380-
this.rowRangeStart = rowRangeStart;
381-
this.graphIndex = graphIndex;
382-
}
383-
}
384-
385-
private static class VectorIndexMetadata {
386-
final int dimension;
387-
final String similarityFunction;
388-
final int m;
389-
final int efConstruction;
390-
391-
VectorIndexMetadata(int dimension, String similarityFunction, int m, int efConstruction) {
392-
this.dimension = dimension;
393-
this.similarityFunction = similarityFunction;
394-
this.m = m;
395-
this.efConstruction = efConstruction;
396-
}
397-
}
398300
}

paimon-vector/src/main/java/org/apache/paimon/vector/VectorGlobalIndexWriter.java

Lines changed: 12 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -124,11 +124,15 @@ public List<ResultEntry> finish() {
124124

125125
// Serialize index and metadata
126126
byte[] indexBytes = serializeIndexWithJVector(graphIndex);
127-
byte[] metaBytes = serializeMetadata();
128-
129-
// Write to file. todo: here could use OnDiskGraphIndex to write directly to file.
130-
// RandomAccessVectorValues ravv
131-
// OnDiskGraphIndex.write(index, ravv, indexPath);
127+
VectorIndexMetadata metadata =
128+
new VectorIndexMetadata(
129+
vectorOptions.dimension(),
130+
vectorOptions.metric(),
131+
vectorOptions.m(),
132+
vectorOptions.efConstruction());
133+
byte[] metaBytes = VectorIndexMetadata.serializeMetadata(metadata);
134+
135+
// Write to file.
132136
String fileName = fileWriter.newFileName(VectorGlobalIndexerFactory.IDENTIFIER);
133137
try (OutputStream out = fileWriter.newOutputStream(fileName)) {
134138
out.write(indexBytes);
@@ -154,32 +158,8 @@ private OnHeapGraphIndex buildJVectorIndex() {
154158

155159
// Create vector values adapter for JVector
156160
RandomAccessVectorValues vectorValues =
157-
new RandomAccessVectorValues() {
158-
@Override
159-
public int size() {
160-
return vectorFloats.size();
161-
}
162-
163-
@Override
164-
public int dimension() {
165-
return vectorOptions.dimension();
166-
}
167-
168-
@Override
169-
public VectorFloat<?> getVector(int i) {
170-
return vectorFloats.get(i);
171-
}
172-
173-
@Override
174-
public boolean isValueShared() {
175-
return false;
176-
}
177-
178-
@Override
179-
public RandomAccessVectorValues copy() {
180-
return this;
181-
}
182-
};
161+
new PaimonRandomAccessVectorValues(
162+
vectorFloats.size(), vectorOptions.dimension(), vectorFloats, false);
183163

184164
// Build HNSW graph index using JVector
185165
GraphIndexBuilder builder =
@@ -199,7 +179,7 @@ private byte[] serializeIndexWithJVector(OnHeapGraphIndex graphIndex) throws IOE
199179
ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
200180
try (DataOutputStream dataOut = new DataOutputStream(byteOut)) {
201181
// Write version
202-
dataOut.writeInt(2); // Version 2 with JVector index
182+
dataOut.writeInt(1); // Version 1 with JVector index
203183

204184
// Write row IDs
205185
dataOut.writeInt(vectors.size());

0 commit comments

Comments
 (0)