Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,13 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(
Arg args;
args.add_input(ext_A);
args.add_input(ext_B);
args.add_inout(C_view);
args.add_output(C_view);
args.add_input(ext_D);
args.add_input(ext_E);
args.add_inout(F_view);
args.add_output(F_view);
args.add_input(ext_G);
args.add_input(ext_H);
args.add_inout(I_view);
args.add_output(I_view);
pto2_rt_submit_task(mk, args);
}

Expand All @@ -117,7 +117,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(
Arg args;
args.add_input(ext_A);
args.add_input(ext_B);
args.add_inout(J_view);
args.add_output(J_view);
pto2_rt_submit_aic_task(FUNC_MATMUL, args);
}

Expand All @@ -126,7 +126,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(
Arg args;
args.add_input(ext_D);
args.add_input(ext_E);
args.add_inout(K_view);
args.add_output(K_view);
pto2_rt_submit_aiv_task(FUNC_ADD_STANDALONE, args);
}

Expand All @@ -138,10 +138,10 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(
Arg args;
args.add_input(ext_D);
args.add_input(ext_E);
args.add_inout(L_view);
args.add_output(L_view);
args.add_input(ext_G);
args.add_input(ext_H);
args.add_inout(M_view);
args.add_output(M_view);
pto2_rt_submit_task(mk, args);
}

Expand All @@ -153,10 +153,10 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(
Arg args;
args.add_input(ext_A);
args.add_input(ext_B);
args.add_inout(N_view);
args.add_output(N_view);
args.add_input(ext_D);
args.add_input(ext_E);
args.add_inout(O_view);
args.add_output(O_view);
pto2_rt_submit_task(mk, args);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(
Arg params_t4;
params_t4.add_input(g);
params_t4.add_input(c);
params_t4.add_inout(ext_f);
params_t4.add_output(ext_f);
pto2_rt_submit_aiv_task(0, params_t4); // kernel_add
} // inner scope ends: releases d, e, g
}
Expand Down
16 changes: 14 additions & 2 deletions python/runtime_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,11 +279,23 @@ def _run_build_step(
logger.debug(result.stderr)

if result.returncode != 0:
logger.error(f"[{platform}] {step_name} failed: {result.stderr}")
raise RuntimeError(f"{step_name} failed for {platform}: {result.stderr}")
self._log_failed_build_output(platform, step_name, result)
raise RuntimeError(f"{step_name} failed for {platform} with exit code {result.returncode}")
except FileNotFoundError:
raise RuntimeError(f"{step_name} not found. Please install {step_name}.")

@staticmethod
def _log_failed_build_output(platform: str, step_name: str, result: subprocess.CompletedProcess) -> None:
    """Emit captured build output at ERROR level so failures are visible by default."""
    logger.error(f"[{platform}] {step_name} failed with exit code {result.returncode}")

    # Log each captured stream (stdout first, then stderr), tracking whether
    # anything at all was emitted so we can note a fully-silent failure.
    captured_any = False
    for stream_name, text in (("stdout", result.stdout), ("stderr", result.stderr)):
        if text:
            captured_any = True
            logger.error(f"[{platform}] {step_name} {stream_name}:\n{text.rstrip()}")
    if not captured_any:
        logger.error(f"[{platform}] {step_name} produced no stdout/stderr output")

def _run_compilation(
self,
cmake_source_dir: str,
Expand Down
53 changes: 43 additions & 10 deletions src/a2a3/runtime/tensormap_and_ringbuffer/runtime/common.h
Original file line number Diff line number Diff line change
@@ -1,42 +1,55 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/

#pragma once

#include <stdio.h>
#include <stdlib.h>

#include <stdexcept>
#include <string>

/**
* 获取当前调用栈信息(包含文件路径和行号)
* 实现在 common.cpp
* Get the current stack trace, including file paths and line numbers.
* Implemented in common.cpp.
*/
std::string get_stacktrace(int skip_frames = 1);

/**
 * Assertion failure exception carrying the failed condition text, the source
 * file/line, and (via the base runtime_error message, built in common.cpp)
 * a stack trace.
 *
 * The stored const char* members are not owned: callers (the assert macros
 * below) pass string literals (#cond, __FILE__), so raw pointers are safe.
 * Fix: removed duplicated `public:` / `private:` access specifiers.
 */
class AssertionError : public std::runtime_error {
public:
    /// Constructor implemented in common.cpp.
    AssertionError(const char* condition, const char* file, int line);

    /// Textual form of the failed condition expression.
    const char* condition() const { return condition_; }
    /// Source file where the assertion failed.
    const char* file() const { return file_; }
    /// Source line where the assertion failed.
    int line() const { return line_; }

private:
    const char* condition_;
    const char* file_;
    int line_;
};

/**
* 断言失败时的处理函数
* 实现在 common.cpp
* Assertion failure handler.
* Implemented in common.cpp.
*/
[[noreturn]] void assert_impl(const char* condition, const char* file, int line);

/**
* debug_assert 宏 - 在 debug 模式下检查条件,失败时抛出异常并打印调用栈
* 在 release 模式 (NDEBUG) 下为空操作
* debug_assert macro:
* checks the condition in debug builds and throws with a stack trace on failure.
* It is a no-op in release builds (NDEBUG).
*/
#ifdef NDEBUG
#define debug_assert(cond) ((void)0)
Expand All @@ -50,11 +63,31 @@ class AssertionError : public std::runtime_error {
#endif

/**
 * always_assert macro:
 * checks the condition in both debug and release builds.
 * On failure it calls assert_impl (declared [[noreturn]] above, implemented
 * in common.cpp), so control never returns from a failed assertion.
 * The do/while(0) wrapper makes the macro expand to a single statement,
 * keeping it safe inside unbraced if/else bodies.
 */
#define always_assert(cond) \
do { \
if (!(cond)) { \
assert_impl(#cond, __FILE__, __LINE__); \
} \
} while (0)

// PTO_PRAGMA turns its argument into a _Pragma operator call, which lets the
// diagnostic pragmas below be emitted from inside #define bodies.
#define PTO_PRAGMA(x) _Pragma(#x)

// Scoped suppression of "may be used uninitialized" warnings. Wrap code the
// compiler cannot prove initialized in MAYBE_UNINITIALIZED_BEGIN /
// MAYBE_UNINITIALIZED_END; the diagnostic push/pop keeps the suppression
// local to that region.
#if defined(__clang__)
// Clang spells these diagnostics -Wuninitialized and -Wsometimes-uninitialized.
#define MAYBE_UNINITIALIZED_BEGIN \
PTO_PRAGMA(clang diagnostic push) \
PTO_PRAGMA(clang diagnostic ignored "-Wuninitialized") \
PTO_PRAGMA(clang diagnostic ignored "-Wsometimes-uninitialized")
#define MAYBE_UNINITIALIZED_END PTO_PRAGMA(clang diagnostic pop)
#elif defined(__GNUC__)
// GCC spells them -Wuninitialized and -Wmaybe-uninitialized.
#define MAYBE_UNINITIALIZED_BEGIN \
PTO_PRAGMA(GCC diagnostic push) \
PTO_PRAGMA(GCC diagnostic ignored "-Wuninitialized") \
PTO_PRAGMA(GCC diagnostic ignored "-Wmaybe-uninitialized")
#define MAYBE_UNINITIALIZED_END PTO_PRAGMA(GCC diagnostic pop)
#else
// Unknown compiler: the markers compile away to nothing.
#define MAYBE_UNINITIALIZED_BEGIN
#define MAYBE_UNINITIALIZED_END
#endif
135 changes: 86 additions & 49 deletions src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,41 @@ static uint32_t g_orch_submit_idx = 0;
#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid)
#endif

// Record `prod_state` as a fanin (upstream-producer) dependency of the task
// being submitted, skipping producers already present in `fanin_states`.
// If the task would exceed PTO2_MAX_INPUTS unique dependencies, emits a
// detailed dependency-overflow report, publishes the error code, marks the
// orchestrator fatal, and returns false; otherwise returns true.
static bool pto2_append_fanin_or_fail(PTO2OrchestratorState* orch,
PTO2TaskId task_id,
int32_t tensor_arg_index,
TensorArgType ptype,
PTO2TaskSlotState* prod_state,
PTO2TaskSlotState* fanin_states[],
int32_t* fanin_count,
const char* reason) {
const int32_t current = *fanin_count;

// A producer already tracked in the fanin list needs no second entry.
for (int32_t idx = 0; idx < current; idx++) {
if (fanin_states[idx] == prod_state) {
return true;
}
}

// Common path: room remains, so append and grow the count.
if (current < PTO2_MAX_INPUTS) {
fanin_states[current] = prod_state;
*fanin_count = current + 1;
return true;
}

// Capacity exhausted: report, flag the shared-memory error code for the
// host side, and put the orchestrator into its fatal state.
LOG_ERROR("========================================");
LOG_ERROR("FATAL: Dependency Overflow Detected!");
LOG_ERROR("========================================");
LOG_ERROR("Task requires more than PTO2_MAX_INPUTS unique fanin dependencies.");
LOG_ERROR(" task_id.raw: %" PRIu64, task_id.raw);
LOG_ERROR(" tensor_arg_index: %d", tensor_arg_index);
LOG_ERROR(" tensor_arg_type: %d", static_cast<int>(ptype));
LOG_ERROR(" fanin_count: %d / %d", *fanin_count, PTO2_MAX_INPUTS);
LOG_ERROR(" reason: %s", reason);
LOG_ERROR("This is a runtime dependency-tracking limit.");
LOG_ERROR("========================================");
orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_DEPENDENCY_OVERFLOW, std::memory_order_release);
orch->fatal = true;
return false;
}

// =============================================================================
// Orchestrator Initialization
// =============================================================================
Expand Down Expand Up @@ -374,7 +409,7 @@ TaskOutputTensors pto2_submit_mixed_task(

int32_t local_id = alloc_result.task_id;
int32_t slot = alloc_result.slot;
PTO2TaskId task_id = pto2_make_task_id(ring_id, static_cast<uint32_t>(local_id));
PTO2TaskId task_id = PTO2TaskId::make(ring_id, static_cast<uint32_t>(local_id));

PTO2TaskDescriptor& task = allocator.task_by_slot(slot);
PTO2TaskPayload* payload = &orch->sm_handle->task_payloads[ring_id][slot];
Expand Down Expand Up @@ -443,47 +478,48 @@ TaskOutputTensors pto2_submit_mixed_task(
// === STEP 3: Lookup inputs + materialize runtime-created outputs ===
for (int i = 0; i < args.tensor_count(); i++) {
TensorArgType ptype = args.tag(i);
if (ptype == TensorArgType::OUTPUT) {
// Runtime-created OUTPUT tensors are not looked up in the TensorMap since they have no dependencies.
continue;
}

switch (ptype) {
case TensorArgType::INOUT:
case TensorArgType::INPUT: {
if (args.tensor(i).ptr->manual_dep) break;
// Look up producer via TensorMap (reads from cached stack tensor)
PTO2LookupResult lookup_result;
orch->tensor_map.lookup(*args.tensor(i).ptr, lookup_result);

for (int r = 0; r < lookup_result.count; r++) {
PTO2TensorMapEntry& entry = *lookup_result.entries[r].entry;
auto overlap_status = lookup_result.entries[r].overlap_status;
// Check if this producer is already in fanin list (avoid duplicates)
auto prod_ring = entry.producer_task_id.ring();
auto prod_local = entry.producer_task_id.local();
PTO2TaskSlotState* prod_state =
&sched->ring_sched_states[prod_ring].get_slot_state_by_task_id(prod_local);
bool already_added = false;
for (int j = 0; j < fanin_count; j++) {
if (fanin_states[j] == prod_state) {
already_added = true;
break;
}
}

if (!already_added) {
// Add to fanin list (this task depends on producer)
if (fanin_count < PTO2_MAX_INPUTS) {
fanin_states[fanin_count++] = prod_state;
}
}
if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) {
if (!entry.with_alloc) {
orch->tensor_map.remove_entry(entry);
}
}
}
break;
const Tensor* tensor = args.tensor(i).ptr;

// Step A: creator retention — all existing tensors extend their creator lifetime.
PTO2TaskId owner = tensor->owner_task_id;
if (owner.is_valid() && sched != nullptr) {
PTO2TaskSlotState* prod_state =
&sched->ring_sched_states[owner.ring()].get_slot_state_by_task_id(owner.local());
if (!pto2_append_fanin_or_fail(
orch, task_id, i, ptype, prod_state, fanin_states, &fanin_count, "creator retention")) {
return result;
}
}

// Step B: only INPUT/INOUT need modifier dependency lookup.
if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) {
continue;
}
if (tensor->manual_dep) {
continue;
}

PTO2LookupResult lookup_result;
orch->tensor_map.lookup(*tensor, lookup_result);

for (int r = 0; r < lookup_result.count; r++) {
PTO2TensorMapEntry& entry = *lookup_result.entries[r].entry;
auto overlap_status = lookup_result.entries[r].overlap_status;
auto prod_ring = entry.producer_task_id.ring();
auto prod_local = entry.producer_task_id.local();
PTO2TaskSlotState* prod_state = &sched->ring_sched_states[prod_ring].get_slot_state_by_task_id(prod_local);
if (!pto2_append_fanin_or_fail(
orch, task_id, i, ptype, prod_state, fanin_states, &fanin_count, "overlap lookup")) {
return result;
}
if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) {
orch->tensor_map.remove_entry(entry);
}
default:
break;
}
}

Expand All @@ -493,16 +529,9 @@ TaskOutputTensors pto2_submit_mixed_task(
{
for (int i = 0; i < args.tensor_count(); i++) {
TensorArgType ptype = args.tag(i);
if (ptype == TensorArgType::INOUT) {
if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) {
if (!args.tensor(i).ptr->manual_dep) {
orch->tensor_map.insert(*args.tensor(i).ptr, task_id, false);
}
} else if (ptype == TensorArgType::OUTPUT) {
if (!args.tensor(i).create_info->manual_dep) {
orch->tensor_map.insert(*args.tensor(i).create_info,
reinterpret_cast<void*>(reinterpret_cast<char*>(alloc_result.packed_base) + offsets[i]),
task_id,
true);
orch->tensor_map.insert(*args.tensor(i).ptr, task_id);
}
}
}
Expand Down Expand Up @@ -533,6 +562,14 @@ TaskOutputTensors pto2_submit_mixed_task(

payload->init(args, result, alloc_result.packed_base, offsets, buffer_sizes);

// Write owner_task_id into materialized OUTPUT tensors so creator-only dependency
// tracking remains available even when manual_dep skips OverlapMap publication.
for (int i = 0; i < args.tensor_count(); i++) {
if (args.tag(i) == TensorArgType::OUTPUT) {
payload->tensors[i].owner_task_id = task_id;
}
}

CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, task_id.raw);
#if PTO2_ORCH_PROFILING
g_orch_args_atomic_count += 2; // fanout_lock.store + fanout_count.store
Expand Down
Loading
Loading