forked from hw-native-sys/simpler
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathruntime_bindings.py
More file actions
343 lines (247 loc) · 9.57 KB
/
runtime_bindings.py
File metadata and controls
343 lines (247 loc) · 9.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
"""
PTO Runtime ctypes Bindings

Provides a Pythonic interface to the PTO runtime via ctypes.
Users must provide a pre-compiled libpto_runtime.so (built via binary_compiler.py).

Usage:
    from runtime_bindings import (
        load_runtime, set_device, register_kernel, launch_runtime,
    )

    Runtime = load_runtime("/path/to/libpto_runtime.so")
    set_device(0)  # must be called before runtime.initialize()

    runtime = Runtime()
    runtime.initialize()

    register_kernel(0, kernel_add)
    register_kernel(1, kernel_add_scalar)
    register_kernel(2, kernel_mul)

    launch_runtime(runtime, aicpu_thread_num=1, block_dim=1,
                   device_id=0, aicpu_binary=aicpu_bytes,
                   aicore_binary=aicore_bytes)

    runtime.finalize()
"""
from ctypes import (
CDLL,
POINTER,
c_int,
c_void_p,
c_uint8,
c_size_t,
)
from pathlib import Path
from typing import Union
import ctypes
import tempfile
# Module-level library reference.
# Holds the ctypes CDLL handle assigned by load_runtime(). While it is still
# None, register_kernel(), set_device() and launch_runtime() raise
# RuntimeError instead of calling into the library.
_lib = None
# ============================================================================
# Runtime Library Loader
# ============================================================================
class RuntimeLibraryLoader:
    """Loads and manages the PTO runtime C API library.

    Attributes:
        lib_path: Location of the library as a ``Path``, as supplied by the caller.
        lib: The ``ctypes.CDLL`` handle, with argument/return types declared
            for every entry point this module calls.
    """

    def __init__(self, lib_path: Union[str, Path]):
        """
        Load the PTO runtime library.

        Args:
            lib_path: Path to libpto_runtime.so

        Raises:
            FileNotFoundError: If library file not found
            OSError: If library cannot be loaded
        """
        lib_path = Path(lib_path)
        if not lib_path.exists():
            raise FileNotFoundError(f"Library not found: {lib_path}")
        self.lib_path = lib_path
        # Hand the loader an absolute path. A bare file name (no directory
        # separator) is otherwise resolved through the dynamic linker's
        # search path rather than relative to the current directory, so the
        # file checked above might not be the file that gets loaded.
        self.lib = CDLL(str(lib_path.resolve()))
        self._setup_functions()

    def _setup_functions(self):
        """Set up ctypes function signatures.

        Declares ``argtypes`` and ``restype`` on each exported function so
        ctypes converts arguments and return values correctly instead of
        falling back to its default ``int`` handling.

        Raises:
            AttributeError: If the library does not export one of the symbols.
        """
        byte_ptr = POINTER(c_uint8)
        # name -> (argtypes, restype); insertion order is the lookup order.
        signatures = {
            # Returns sizeof(Runtime) for user allocation.
            "GetRuntimeSize": ([], c_size_t),
            # Placement new + build runtime.
            "InitRuntime": ([c_void_p], c_int),
            # Device init + execute runtime.
            "launch_runtime": (
                [
                    c_void_p,   # runtime
                    c_int,      # aicpu_thread_num
                    c_int,      # block_dim
                    c_int,      # device_id
                    byte_ptr,   # aicpu_binary
                    c_size_t,   # aicpu_size
                    byte_ptr,   # aicore_binary
                    c_size_t,   # aicore_size
                ],
                c_int,
            ),
            # Validate + cleanup.
            "FinalizeRuntime": ([c_void_p], c_int),
            # Register kernel binary for func_id.
            "RegisterKernel": ([c_int, byte_ptr, c_size_t], c_int),
            # Set device and create streams.
            "set_device": ([c_int], c_int),
        }
        for name, (argtypes, restype) in signatures.items():
            func = getattr(self.lib, name)
            func.argtypes = argtypes
            func.restype = restype
# ============================================================================
# Python Wrapper Classes
# ============================================================================
class Runtime:
    """
    Task dependency runtime.

    Thin Python handle over the C Runtime API. The storage for the native
    object is owned by Python (a ctypes buffer); the C++ side constructs the
    object inside that storage with placement new.
    """

    def __init__(self, lib: CDLL):
        """
        Create a new runtime handle.

        Args:
            lib: Loaded ctypes library (RuntimeLibraryLoader.lib)
        """
        self.lib = lib

        # Ask the library how large its Runtime object is, then reserve that
        # many bytes; _handle is the same buffer viewed as a void pointer.
        nbytes = lib.GetRuntimeSize()
        storage = ctypes.create_string_buffer(nbytes)
        self._buffer = storage
        self._handle = ctypes.cast(storage, c_void_p)

    def initialize(self) -> None:
        """
        Initialize the runtime structure.

        Delegates to InitRuntime() in C++, which uses placement new to
        construct the Runtime, builds tasks, allocates device tensors, and
        initializes data.

        Raises:
            RuntimeError: If initialization fails
        """
        status = self.lib.InitRuntime(self._handle)
        if status == 0:
            return
        raise RuntimeError(f"InitRuntime failed: {status}")

    def finalize(self) -> None:
        """
        Finalize and cleanup the runtime.

        Delegates to FinalizeRuntime() in C++, which validates computation
        results, frees device tensors, and calls the Runtime destructor.

        Raises:
            RuntimeError: If finalization fails
        """
        status = self.lib.FinalizeRuntime(self._handle)
        if status == 0:
            return
        raise RuntimeError(f"FinalizeRuntime failed: {status}")

    def __del__(self):
        """Clean up runtime resources."""
        # Nothing to do here: finalize() runs the native destructor, and the
        # backing buffer is released by Python's garbage collector.
        pass
# ============================================================================
# Module-level Functions
# ============================================================================
def register_kernel(func_id: int, binary_data: bytes) -> None:
    """
    Register a kernel binary for a func_id.

    Takes already-extracted .text section bytes and hands them to the
    library's RegisterKernel(). Per the runtime's contract, the native side
    allocates device GM memory, copies the binary to the device, and keeps
    the GM address for later use by launch_runtime().

    Args:
        func_id: Function identifier (0, 1, 2, ...)
        binary_data: Kernel .text section binary data

    Raises:
        RuntimeError: If not initialized or registration fails
        ValueError: If binary_data is empty
    """
    if _lib is None:
        raise RuntimeError("Runtime not loaded. Call load_runtime() first.")
    if not binary_data:
        raise ValueError("binary_data cannot be empty")

    # Copy the bytes into a ctypes uint8 array the C function can read.
    array_type = c_uint8 * len(binary_data)
    payload = array_type.from_buffer_copy(binary_data)

    status = _lib.RegisterKernel(func_id, payload, len(binary_data))
    if status != 0:
        raise RuntimeError(f"RegisterKernel failed: {status}")
def set_device(device_id: int) -> None:
    """
    Set device and create streams for memory operations.

    Call this before runtime.initialize() so that device tensor allocation
    is possible. Only minimal initialization is performed:
    - rtSetDevice(device_id)
    - Create AICPU and AICore streams

    Loading of the binaries is deferred to launch_runtime().

    Args:
        device_id: Device ID (0-15)

    Raises:
        RuntimeError: If not loaded or device setup fails
    """
    if _lib is None:
        raise RuntimeError("Runtime not loaded. Call load_runtime() first.")

    status = _lib.set_device(device_id)
    if status == 0:
        return
    raise RuntimeError(f"set_device failed: {status}")
def launch_runtime(
    runtime: "Runtime",
    aicpu_thread_num: int,
    block_dim: int,
    device_id: int,
    aicpu_binary: bytes,
    aicore_binary: bytes,
) -> None:
    """
    Execute a runtime on the device.

    Forwards to the library's launch_runtime(). Per the runtime's contract,
    the native side initializes the DeviceRunner singleton (if first call),
    copies the runtime to the device, launches kernels, synchronizes, and
    copies the runtime back from the device.

    Args:
        runtime: Runtime to execute (must have been initialized via runtime.initialize())
        aicpu_thread_num: Number of AICPU scheduler threads
        block_dim: Number of blocks (1 block = 1 AIC + 2 AIV)
        device_id: Device ID (0-15)
        aicpu_binary: Binary data of AICPU shared object
        aicore_binary: Binary data of AICore kernel

    Raises:
        RuntimeError: If not initialized or execution fails
    """
    if _lib is None:
        raise RuntimeError("Runtime not loaded. Call load_runtime() first.")

    # Copy each binary into a ctypes uint8 array the C function can read.
    aicpu_buf = (c_uint8 * len(aicpu_binary)).from_buffer_copy(aicpu_binary)
    aicore_buf = (c_uint8 * len(aicore_binary)).from_buffer_copy(aicore_binary)

    call_args = (
        runtime._handle,
        aicpu_thread_num,
        block_dim,
        device_id,
        aicpu_buf,
        len(aicpu_binary),
        aicore_buf,
        len(aicore_binary),
    )
    status = _lib.launch_runtime(*call_args)
    if status != 0:
        raise RuntimeError(f"launch_runtime failed: {status}")
# ============================================================================
# Public API
# ============================================================================
def load_runtime(lib_path: Union[str, Path, bytes]) -> type:
    """
    Load the PTO runtime library and return Runtime class.

    Also stores the loaded library in the module-level ``_lib`` so that
    register_kernel(), set_device() and launch_runtime() can use it.

    Args:
        lib_path: Path to libpto_runtime.so (str/Path), or compiled binary data (bytes)

    Returns:
        Runtime subclass whose no-argument constructor is bound to the
        library loaded by this call.

    Raises:
        FileNotFoundError: If the library file does not exist
        OSError: If the library cannot be loaded

    Example:
        from runtime_bindings import load_runtime, register_kernel, launch_runtime

        Runtime = load_runtime("/path/to/libpto_runtime.so")
        runtime = Runtime()
        runtime.initialize()

        register_kernel(0, kernel_add)
        register_kernel(1, kernel_add_scalar)
        register_kernel(2, kernel_mul)

        launch_runtime(runtime, aicpu_thread_num=1, block_dim=1,
                       device_id=0, aicpu_binary=aicpu_bytes,
                       aicore_binary=aicore_bytes)

        runtime.finalize()
    """
    global _lib

    # The loader works from a filesystem path, so raw bytes are first
    # written out to a temporary .so file.
    temp_path = None
    if isinstance(lib_path, bytes):
        with tempfile.NamedTemporaryFile(delete=False, suffix='.so') as f:
            f.write(lib_path)
        temp_path = lib_path = f.name

    try:
        loader = RuntimeLibraryLoader(lib_path)
    finally:
        # The temporary file is only needed while the library is being
        # loaded; remove it whether or not loading succeeded so repeated
        # calls do not accumulate files. Removal is best-effort: a platform
        # that refuses to delete a loaded library leaves the file in place.
        if temp_path is not None:
            try:
                Path(temp_path).unlink()
            except OSError:
                pass

    lib = loader.lib
    _lib = lib

    # Bind the class to this call's library handle (not to the module
    # global), so a later load_runtime() call cannot silently retarget
    # Runtime classes that were returned earlier.
    class _Runtime(Runtime):
        def __init__(self):
            super().__init__(lib)

    return _Runtime