Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions python/kernel_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ def compile_incore(
if incore_toolchain == ToolchainType.HOST_GXX_15:
return self._compile_incore_sim(
source_path,
core_type=core_type,
pto_isa_root=pto_isa_root,
extra_include_dirs=extra_include_dirs,
build_dir=build_dir,
Expand Down Expand Up @@ -467,6 +468,7 @@ def _compile_orchestration_shared_lib(
def _compile_incore_sim(
self,
source_path: str,
core_type: str = "aiv",
pto_isa_root: Optional[str] = None,
extra_include_dirs: Optional[List[str]] = None,
build_dir: Optional[str] = None,
Expand Down Expand Up @@ -499,6 +501,14 @@ def _compile_incore_sim(

# Build command from toolchain
cmd = [self.gxx15.cxx_path] + self.gxx15.get_compile_flags()
if sys.platform == "darwin":
cmd.extend(["-Wl,-undefined,dynamic_lookup"])
if core_type == "aic":
cmd.append("-D__DAV_CUBE__")
elif core_type == "aiv":
cmd.append("-D__DAV_VEC__")
else:
raise ValueError(f"Unknown core_type for simulation kernel compilation: {core_type}")

# Add PTO ISA header paths if provided
if pto_isa_root:
Expand Down
3 changes: 3 additions & 0 deletions src/a5/platform/onboard/aicore/inner_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,7 @@ __aicore__ __attribute__((always_inline)) inline uint64_t get_sys_cnt_aicore() {
return get_sys_cnt();
}

// CPU_SIM_SET_TASK_COOKIE - no-op on real hardware (simulation only)
// Expands to ((void)0): the `cookie` argument does not appear in the expansion,
// so the expression passed in is discarded and never evaluated.
#define CPU_SIM_SET_TASK_COOKIE(cookie) ((void)0)

#endif // PLATFORM_A5_AICORE_INNER_KERNEL_H_
4 changes: 4 additions & 0 deletions src/a5/platform/sim/aicore/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,7 @@ set_target_properties(aicore_kernel PROPERTIES
# This matches the hardcoded paths in Python/C++ code
SUFFIX ".so"
)

# On Apple platforms, pass "-undefined dynamic_lookup" to the linker for
# aicore_kernel so that symbols left undefined at link time are looked up when
# the library is loaded instead of failing the link.
# NOTE(review): presumably needed for the pto_cpu_sim_* functions that this
# target only declares - confirm against the host runtime build.
if(APPLE)
target_link_options(aicore_kernel PRIVATE -Wl,-undefined,dynamic_lookup)
endif()
15 changes: 15 additions & 0 deletions src/a5/platform/sim/aicore/inner_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,19 @@ inline uint32_t get_physical_core_id() {
return g_sim_physical_core_id;
}

// =============================================================================
// CPU Simulation Context APIs
// =============================================================================

// Set execution context for the current simulation thread (block/subblock identity).
// Called by kernel.cpp wrapper before aicore_execute().
// Declaration only; the definition is not part of this header.
//   block_idx    - block index to record for the calling thread
//   subblock_id  - subblock index to record for the calling thread
//   subblock_dim - subblock count to record (presumably subblocks per block - confirm)
extern "C" void pto_cpu_sim_set_execution_context(uint32_t block_idx, uint32_t subblock_id, uint32_t subblock_dim);

// Set task cookie for the current simulation thread.
// Called by aicore_executor before each task dispatch.
// Declaration only; the definition is not part of this header.
//   task_cookie - opaque 64-bit value to associate with the upcoming task
extern "C" void pto_cpu_sim_set_task_cookie(uint64_t task_cookie);

// CPU_SIM_SET_TASK_COOKIE - set task cookie in simulation, no-op on real hardware
// In this simulation header the macro forwards `cookie` unchanged to
// pto_cpu_sim_set_task_cookie().
#define CPU_SIM_SET_TASK_COOKIE(cookie) pto_cpu_sim_set_task_cookie(cookie)

#endif // PLATFORM_A5SIM_AICORE_INNER_KERNEL_H_
14 changes: 14 additions & 0 deletions src/a5/platform/sim/aicore/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,20 @@ extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, C
}

g_sim_physical_core_id = physical_core_id;
const uint32_t num_aic = static_cast<uint32_t>(runtime->worker_count / PLATFORM_CORES_PER_BLOCKDIM);
uint32_t cpu_block_idx = static_cast<uint32_t>(block_idx);
uint32_t subblock_id = 0;
uint32_t subblock_dim = 1;

if (core_type == CoreType::AIV && physical_core_id >= num_aic) {
const uint32_t aiv_offset = physical_core_id - num_aic;
cpu_block_idx = aiv_offset / PLATFORM_AIV_CORES_PER_BLOCKDIM;
subblock_id = aiv_offset % PLATFORM_AIV_CORES_PER_BLOCKDIM;
subblock_dim = PLATFORM_AIV_CORES_PER_BLOCKDIM;
} else {
cpu_block_idx = physical_core_id;
}

pto_cpu_sim_set_execution_context(cpu_block_idx, subblock_id, subblock_dim);
aicore_execute(runtime, block_idx, core_type);
}
1 change: 1 addition & 0 deletions src/a5/platform/sim/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ endif()
set(HOST_RUNTIME_SOURCES "")
list(APPEND HOST_RUNTIME_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/device_runner.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/cpu_sim_state.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp"
Expand Down
89 changes: 89 additions & 0 deletions src/a5/platform/sim/host/cpu_sim_state.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/**
* @file cpu_sim_state.cpp
* @brief CPU simulation state management for AICore execution context
*
* Provides thread-local execution context and shared storage APIs for
* the simulation environment. These extern "C" functions are called by
* AICore simulation code (via dlsym) to emulate hardware-provided
* block/subblock identity and cross-core shared state (e.g., VEC_FIFO).
*
* On real hardware these calls are compiled out via no-op macros in
* onboard/aicore/inner_kernel.h.
*/

#include <cstdint>
#include <cstdlib>
#include <cstddef>
#include <map>
#include <mutex>
#include <string>

#include "cpu_sim_state.h"

// File-local simulation state; nothing in this namespace is visible outside
// this translation unit.
namespace {
// Per-thread execution context. Written by pto_cpu_sim_set_execution_context()
// and read back by pto_cpu_sim_get_execution_context().
thread_local uint32_t g_cpu_sim_block_idx = 0;
thread_local uint32_t g_cpu_sim_subblock_id = 0;
// Starts at 1; the setter also stores 1 when it is handed 0.
thread_local uint32_t g_cpu_sim_subblock_dim = 1;
// Per-thread task cookie. Written by pto_cpu_sim_set_task_cookie() and read
// back by pto_cpu_sim_get_task_cookie().
thread_local uint64_t g_cpu_sim_task_cookie = 0;
// Locked by clear_cpu_sim_shared_storage() and pto_cpu_sim_get_shared_storage()
// while they touch g_cpu_sim_shared_storage.
std::mutex g_cpu_sim_shared_storage_mutex;
// Key -> buffer obtained from std::calloc(). Entries are released with
// std::free() in clear_cpu_sim_shared_storage().
std::map<std::string, void*> g_cpu_sim_shared_storage;
} // namespace

// Release every buffer registered in the shared-storage map and then empty
// the map. The storage mutex is held for the whole operation.
void clear_cpu_sim_shared_storage()
{
    std::lock_guard<std::mutex> guard(g_cpu_sim_shared_storage_mutex);

    // Only the mapped pointer matters here; the key is not needed.
    for (auto entry = g_cpu_sim_shared_storage.begin(); entry != g_cpu_sim_shared_storage.end(); ++entry) {
        std::free(entry->second);
    }

    g_cpu_sim_shared_storage.clear();
}

// Record the block/subblock identity for the calling thread.
// A subblock_dim of 0 is stored as 1; any other value is stored unchanged.
extern "C" void pto_cpu_sim_set_execution_context(uint32_t block_idx, uint32_t subblock_id,
                                                  uint32_t subblock_dim)
{
    uint32_t effective_dim = subblock_dim;
    if (effective_dim == 0) {
        effective_dim = 1u;
    }

    g_cpu_sim_block_idx = block_idx;
    g_cpu_sim_subblock_id = subblock_id;
    g_cpu_sim_subblock_dim = effective_dim;
}

// Store the given cookie in the calling thread's task-cookie slot, replacing
// whatever value was there before.
extern "C" void pto_cpu_sim_set_task_cookie(uint64_t task_cookie)
{
    uint64_t& cookie_slot = g_cpu_sim_task_cookie;
    cookie_slot = task_cookie;
}

// Copy the calling thread's execution context into the supplied output
// pointers. Each pointer is optional: a null pointer is skipped.
extern "C" void pto_cpu_sim_get_execution_context(uint32_t* block_idx, uint32_t* subblock_id,
                                                  uint32_t* subblock_dim)
{
    const auto write_if_requested = [](uint32_t* destination, uint32_t value) {
        if (destination != nullptr) {
            *destination = value;
        }
    };

    write_if_requested(block_idx, g_cpu_sim_block_idx);
    write_if_requested(subblock_id, g_cpu_sim_subblock_id);
    write_if_requested(subblock_dim, g_cpu_sim_subblock_dim);
}

// Return the task cookie currently stored for the calling thread
// (0 if none has been set on this thread).
extern "C" uint64_t pto_cpu_sim_get_task_cookie()
{
    const uint64_t current_cookie = g_cpu_sim_task_cookie;
    return current_cookie;
}

// Return the zero-initialized buffer registered under `key`, allocating it on
// first use.
//
//   key  - name of the shared buffer; nullptr yields nullptr.
//   size - number of bytes to allocate when the key is new; 0 yields nullptr.
//
// Returns the buffer pointer, or nullptr on invalid arguments or when the
// allocation fails. The buffer stays owned by the shared-storage map and is
// released by clear_cpu_sim_shared_storage().
//
// NOTE(review): `size` is only used the first time a key is seen. A later call
// with the same key and a larger size gets the original, smaller buffer back.
extern "C" void* pto_cpu_sim_get_shared_storage(const char* key, size_t size)
{
    if (key == nullptr || size == 0) {
        return nullptr;
    }

    std::lock_guard<std::mutex> lock(g_cpu_sim_shared_storage_mutex);

    // Reserve the map slot first: a single lookup, and if building the key or
    // the node throws, nothing has been allocated yet and so nothing leaks.
    auto [slot, inserted] = g_cpu_sim_shared_storage.try_emplace(key, nullptr);
    if (!inserted) {
        return slot->second;
    }

    void* storage = std::calloc(1, size);
    if (storage == nullptr) {
        // Do not leave a null entry behind: it would make every later request
        // for this key return nullptr even once memory is available again.
        g_cpu_sim_shared_storage.erase(slot);
        return nullptr;
    }

    slot->second = storage;
    return storage;
}
14 changes: 14 additions & 0 deletions src/a5/platform/sim/host/cpu_sim_state.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/**
* @file cpu_sim_state.h
* @brief Internal header for CPU simulation state lifecycle management
*
* Declares clear_cpu_sim_shared_storage() for DeviceRunner to call at
* run() entry and finalize() to reset simulation state between runs.
*/

#ifndef PLATFORM_SIM_HOST_CPU_SIM_STATE_H_
#define PLATFORM_SIM_HOST_CPU_SIM_STATE_H_

// Frees every buffer held in the CPU-simulation shared storage and leaves the
// storage empty. Defined in cpu_sim_state.cpp.
void clear_cpu_sim_shared_storage();

#endif // PLATFORM_SIM_HOST_CPU_SIM_STATE_H_
5 changes: 4 additions & 1 deletion src/a5/platform/sim/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "device_runner.h"
#include "aicpu/platform_aicpu_affinity.h"
#include "host/raii_scope_guard.h"
#include "cpu_sim_state.h"

// Function pointer types for dynamically loaded executors
typedef int (*aicpu_execute_func_t)(Runtime* runtime);
Expand Down Expand Up @@ -139,6 +140,8 @@ int DeviceRunner::run(Runtime& runtime,
const std::vector<uint8_t>& aicore_kernel_binary,
int launch_aicpu_num) {

clear_cpu_sim_shared_storage();

// Validate launch_aicpu_num
if (launch_aicpu_num < 1 || launch_aicpu_num > PLATFORM_MAX_AICPU_THREADS) {
LOG_ERROR("launch_aicpu_num (%d) must be in range [1, %d]",
Expand Down Expand Up @@ -429,6 +432,7 @@ int DeviceRunner::finalize() {

// Free all remaining allocations
mem_alloc_.finalize();
clear_cpu_sim_shared_storage();

device_id_ = -1;
worker_count_ = 0;
Expand Down Expand Up @@ -547,4 +551,3 @@ void DeviceRunner::poll_and_collect_performance_data(int expected_tasks) {
// Forward the swimlane JSON export request to perf_collector_ and hand back
// the status it reports.
int DeviceRunner::export_swimlane_json(const std::string& output_path) {
    const int export_status = perf_collector_.export_swimlane_json(output_path);
    return export_status;
}

Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in
uint64_t start_time = get_sys_cnt_aicore();

// Execute the task
CPU_SIM_SET_TASK_COOKIE(reinterpret_cast<uint64_t>(payload->args));
execute_task(payload);

// Performance profiling: record task execution
Expand Down
Loading
Loading