diff --git a/util/tracer_nvbit/others/cuda_api_tracer_tool/Makefile b/util/tracer_nvbit/others/cuda_api_tracer_tool/Makefile new file mode 100644 index 000000000..adca4b872 --- /dev/null +++ b/util/tracer_nvbit/others/cuda_api_tracer_tool/Makefile @@ -0,0 +1,12 @@ +SUB_DIRS = $(wildcard */.) +SUB_DIRS_ALL = $(SUB_DIRS:%=all-%) +SUB_DIRS_CLEAN = $(SUB_DIRS:%=clean-%) + +all: $(SUB_DIRS_ALL) +clean: $(SUB_DIRS_CLEAN) + +$(SUB_DIRS_ALL): + $(MAKE) $(MAKE_FLAGS) -C $(@:all-%=%) + +$(SUB_DIRS_CLEAN): + $(MAKE) $(MAKE_FLAGS) -C $(@:clean-%=%) clean \ No newline at end of file diff --git a/util/tracer_nvbit/others/cuda_api_tracer_tool/cuda_api_tracer/Makefile b/util/tracer_nvbit/others/cuda_api_tracer_tool/cuda_api_tracer/Makefile new file mode 100644 index 000000000..443c1e017 --- /dev/null +++ b/util/tracer_nvbit/others/cuda_api_tracer_tool/cuda_api_tracer/Makefile @@ -0,0 +1,36 @@ +NVCC=nvcc -ccbin=$(CXX) -D_FORCE_INLINES + +NVCC_VER_REQ=10.1 +NVCC_VER=$(shell $(NVCC) --version | grep release | cut -f2 -d, | cut -f3 -d' ') +NVCC_VER_CHECK=$(shell echo "${NVCC_VER} >= $(NVCC_VER_REQ)" | bc) + +ifeq ($(NVCC_VER_CHECK),0) +$(error ERROR: nvcc version >= $(NVCC_VER_REQ) required to compile an nvbit tool! Instrumented applications can still use lower versions of nvcc.) +endif + +NVBIT_PATH=../../../nvbit_release/core +INCLUDES=-I$(NVBIT_PATH) + +LIBS=-L$(NVBIT_PATH) -lnvbit +NVCC_PATH=-L $(subst bin/nvcc,lib64,$(shell which nvcc | tr -s /)) + +SOURCES=$(wildcard *.cu) + +OBJECTS=$(SOURCES:.cu=.o) +ARCH?=35 + +mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) +current_dir := $(notdir $(patsubst %/,%,$(dir $(mkfile_path)))) + +NVBIT_TOOL=$(current_dir).so + +all: $(NVBIT_TOOL) + +$(NVBIT_TOOL): $(OBJECTS) $(NVBIT_PATH)/libnvbit.a + $(NVCC) -arch=sm_$(ARCH) -O3 $(OBJECTS) $(LIBS) $(NVCC_PATH) -lcuda -lcudart_static -shared -o $@ + +%.o: %.cu + $(NVCC) -dc -c -std=c++11 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -arch=sm_$(ARCH) -O3 -Xcompiler -fPIC $< -o $@ + +clean: + rm -f *.so *.o diff --git a/util/tracer_nvbit/others/cuda_api_tracer_tool/cuda_api_tracer/cuda_api_tracer.cu b/util/tracer_nvbit/others/cuda_api_tracer_tool/cuda_api_tracer/cuda_api_tracer.cu new file mode 100644 index 000000000..676eac70d --- /dev/null +++ b/util/tracer_nvbit/others/cuda_api_tracer_tool/cuda_api_tracer/cuda_api_tracer.cu @@ -0,0 +1,332 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <string>
+
+/* every tool needs to include this once */
+#include "nvbit_tool.h"
+
+/* nvbit interface file */
+#include "nvbit.h"
+
+/* nvbit utility functions */
+#include "utils/utils.h"
+
+/* kernel id counter, maintained in system memory */
+uint32_t kernel_id = 0;
+
+/* total instruction counter, maintained in system memory, incremented by
+ * "counter" every time a kernel completes */
+uint64_t tot_app_instrs = 0;
+
+/* counters for memcpy calls */
+uint64_t tot_memcpy_h2d = 0;
+uint64_t tot_memcpy_d2h = 0;
+
+/* Trace file pointer */
+FILE * traceFp;
+
+/* Map to keep device pointer references */
+std::map<CUdeviceptr, char *> *dptr_map;
+
+/* kernel instruction counter, updated by the GPU */
+__managed__ uint64_t counter = 0;
+
+/* global control variables for this tool */
+uint32_t instr_begin_interval = 0;
+uint32_t instr_end_interval = UINT32_MAX;
+uint32_t start_grid_num = 0;
+uint32_t end_grid_num = UINT32_MAX;
+int verbose = 0;
+int count_warp_level = 1;
+int exclude_pred_off = 0;
+int active_from_start = 1;
+bool mangled = false;
+
+/* used to select region of interest when active from start is off */
+bool active_region = true;
+
+/* a pthread mutex, used to prevent multiple kernels from running concurrently
+ * and therefore "corrupting" the counter variable */
+pthread_mutex_t mutex;
+
+/* nvbit_at_init() is executed as soon as the nvbit tool is loaded. We typically
+ * do initializations in this call. In this case for instance we get some
+ * environment variable values which we use as input arguments to the tool */
+void nvbit_at_init() {
+    /* just make sure all managed variables are allocated on GPU */
+    setenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC", "1", 1);
+
+    /* we get some environment variables that are going to be used to
+     * selectively instrument (within an interval of kernel indexes and
+     * instructions). By default we instrument everything.
+     */
+    GET_VAR_INT(
+        instr_begin_interval, "INSTR_BEGIN", 0,
+        "Beginning of the instruction interval where to apply instrumentation");
+    GET_VAR_INT(
+        instr_end_interval, "INSTR_END", UINT32_MAX,
+        "End of the instruction interval where to apply instrumentation");
+    GET_VAR_INT(start_grid_num, "START_GRID_NUM", 0,
+                "Beginning of the kernel grid launch interval where to apply "
+                "instrumentation");
+    GET_VAR_INT(
+        end_grid_num, "END_GRID_NUM", UINT32_MAX,
+        "End of the kernel launch interval where to apply instrumentation");
+    GET_VAR_INT(count_warp_level, "COUNT_WARP_LEVEL", 1,
+                "Count warp level or thread level instructions");
+    GET_VAR_INT(exclude_pred_off, "EXCLUDE_PRED_OFF", 0,
+                "Exclude predicated off instructions from the count");
+    GET_VAR_INT(
+        active_from_start, "ACTIVE_FROM_START", 1,
+        "Start instruction counting from start or wait for cuProfilerStart "
+        "and cuProfilerStop");
+    GET_VAR_INT(mangled, "MANGLED_NAMES", 1,
+                "Print kernel names mangled or not");
+
+    GET_VAR_INT(verbose, "TOOL_VERBOSE", 0, "Enable verbosity inside the tool");
+    if (active_from_start == 0) {
+        active_region = false;
+    }
+
+    // Init trace file pointer
+    traceFp = fopen("cuda_calls.trace", "w");
+
+    // Init device pointer map
+    dptr_map = new std::map<CUdeviceptr, char *>();
+
+    std::string pad(100, '-');
+    printf("%s\n", pad.c_str());
+
+}
+
+/* This callback is triggered every time a CUDA driver call is encountered.
+ * Here we can look for a particular CUDA driver call by checking the
+ * callback ids, which are defined in tools_cuda_api_meta.h.
+ * This callback is triggered both at entry and at exit of each CUDA driver
+ * call; is_exit=0 is entry, is_exit=1 is exit.
+ */
+// TODO: Balar runs on api calls, will this cause issues?
+void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,
+                         const char *name, void *params, CUresult *pStatus) {
+    /* Identify all the possible CUDA launch events */
+    if (is_exit) {
+        // Check the memalloc result after it finishes
+        // Deref as the argument is passed by addr
+        if (cbid == API_CUDA_cuMemAlloc || cbid == API_CUDA_cuMemAlloc_v2) {
+            // TODO: Use address instead of value to keep the reference?
+            cuMemAlloc_v2_params *p = (cuMemAlloc_v2_params *)params;
+            uint64_t size = dptr_map->size();
+
+            // Assuming the number of device allocations stays small enough
+            // that the generated name fits in the 30-byte buffer
+            char *name = (char *)malloc(30);
+            sprintf(name, "dptr-%lu", size);
+            dptr_map->insert({*(p->dptr), name});
+
+            fprintf(traceFp, "CUDA memalloc: dptr: %s, size: %zu\n", name,
+                    p->bytesize);
+        } else if (cbid == API_CUDA_cuMemcpyDtoH
+                   || cbid == API_CUDA_cuMemcpyDtoHAsync
+                   || cbid == API_CUDA_cuMemcpyDtoH_v2
+                   || cbid == API_CUDA_cuMemcpyDtoHAsync_v2
+                   || cbid == API_CUDA_cuMemcpyDtoH_v2_ptds
+                   || cbid == API_CUDA_cuMemcpyDtoHAsync_v2_ptsz) {
+            // Handled at exit as we need the copied data for verification purposes
+            cuMemcpyDtoH_v2_params *p = (cuMemcpyDtoH_v2_params *)params;
+            CUdeviceptr dptr = p->srcDevice;
+            char *name = dptr_map->find(dptr)->second;
+
+            // Dump the copied data (dst here, as the copy has finished) to a
+            // file named cuMemcpyD2H-COUNT-SIZE.data
+            char buf[200];
+            sprintf(buf, "cuMemcpyD2H-%lu-%zu.data", tot_memcpy_d2h, p->ByteCount);
+            std::string filename(buf);
+            std::ofstream out(filename);
+            if (out.is_open()) {
+                out.write((const char *)p->dstHost, p->ByteCount);
+                out.close();
+            } else {
+                std::cerr << "Cannot open " << filename << std::endl;
+            }
+
+            tot_memcpy_d2h++;
+
+            fprintf(traceFp,
+                    "CUDA memcpyD2H detected: host_ptr: %p, device_ptr: %s, "
+                    "size: %zu, data_file: %s\n",
+                    p->dstHost, name, p->ByteCount, buf);
+        }
+        return;
+    }
+
+    if (cbid == API_CUDA_cuLaunchKernel_ptsz || cbid == API_CUDA_cuLaunchKernel) {
+        /* cast params to cuLaunchKernel_params since if we are here we know
+         * these are the right parameter types */
+        cuLaunchKernel_params *p = (cuLaunchKernel_params *)params;
+        std::string funcName(nvbit_get_func_name(ctx, p->f, false));
+        std::string funcNamePTX(nvbit_get_func_name(ctx, p->f, true));
+        std::size_t start = funcName.find("(");
+        std::size_t end = funcName.find(")");
+        std::string funcTypes = funcName.substr(start + 1, end - start - 1);
+
+        fprintf(traceFp,
+                "CUDA cu kernel launch detected: name: %s, ptx_name: %s, "
+                "funcptr: %p, gdx: %d, gdy: %d, gdz: %d, "
+                "bdx: %d, bdy: %d, bdz: %d, sharedBytes: %d, "
+                "CUstream: %p, args: ",
+                funcName.substr(0, start).c_str(),
+                funcNamePTX.c_str(),
+                p->f, p->gridDimX, p->gridDimY, p->gridDimZ,
+                p->blockDimX, p->blockDimY, p->blockDimZ,
+                p->sharedMemBytes, p->hStream);
+
+        // TODO: Need to get kernel param count, size, and offset information.
+        // Could use nvbit_get_kernel_argument_sizes (nvbit 1.5.5 has a problem
+        // with this) or nvbit_get_func_name.
+        // Currently we parse the function signature instead, which
+        // will have some compatibility issues.
+        void **tmp = (p->kernelParams);
+
+        // Parse each argument type
+        while (funcTypes.size() > 0) {
+            std::string type;
+            std::string delim(", ");
+            std::size_t firstSplit = funcTypes.find(delim);
+            if (firstSplit == std::string::npos) {
+                // Reached the last argument
+                type = funcTypes;
+                funcTypes.clear();
+            } else {
+                type = funcTypes.substr(0, firstSplit);
+                funcTypes = funcTypes.substr(firstSplit + delim.length());
+            }
+
+            // Match on the type and print the argument value and size
+            if (type.compare("double*") == 0) {
+                CUdeviceptr dptr = (CUdeviceptr) *(double **)(*tmp);
+                char *name = dptr_map->find(dptr)->second;
+                fprintf(traceFp, "%s/%zu/", name, sizeof(double *));
+            } else if (type.compare("float*") == 0) {
+                CUdeviceptr dptr = (CUdeviceptr) *(float **)(*tmp);
+                char *name = dptr_map->find(dptr)->second;
+                fprintf(traceFp, "%s/%zu/", name, sizeof(float *));
+            } else if (type.compare("int*") == 0) {
+                CUdeviceptr dptr =
+                    (CUdeviceptr) *(int **)(*tmp);
+                char *name = dptr_map->find(dptr)->second;
+                fprintf(traceFp, "%s/%zu/", name, sizeof(int *));
+            } else if (type.compare("double") == 0) {
+                fprintf(traceFp, "%f/%zu/", *(double *)(*tmp), sizeof(double));
+            } else if (type.compare("float") == 0) {
+                fprintf(traceFp, "%f/%zu/", *(float *)(*tmp), sizeof(float));
+            } else if (type.compare("int") == 0) {
+                fprintf(traceFp, "%d/%zu/", *(int *)(*tmp), sizeof(int));
+            } else {
+                // Waiting on NVBit to recognize argument types,
+                // but you could add your own data types here to parse the
+                // function signature
+                assert(0 && "Waiting on NVBit 1.5.6 release to recognize argument sizes automatically");
+            }
+
+            // Increment the argument pointer
+            tmp++;
+        }
+
+        // TODO: Cannot use this until the next issue is resolved
+        // TODO: See: https://github.com/NVlabs/NVBit/issues/80
+        /**
+        std::vector<int> sizes = nvbit_get_kernel_argument_sizes(p->f);
+        void** tmp = (p->kernelParams);
+
+        for (auto it = sizes.begin(); it < sizes.end(); it++) {
+            int size = *it;
+            if (size == 1 << 0) {
+                printf("%d, ", *(uint8_t *)(*tmp));
+            } else if (size == 1 << 1) {
+                printf("%d, ", *(uint16_t *)(*tmp));
+            } else if (size == 1 << 2) {
+                printf("%d, ", *(uint32_t *)(*tmp));
+            } else if (size == 1 << 3) {
+                printf("%p, ", *(uint64_t *)(*tmp));
+            } else {
+                printf("Unknown type, ");
+            }
+            tmp++;
+        }*/
+
+        // // TODO: Hard coded for testing
+        // printf("%p, ", *((double **)(*tmp)));
+        // tmp++;
+        // printf("%p, ", *((double **)(*tmp)));
+        // tmp++;
+        // printf("%p, ", *((double **)(*tmp)));
+        // tmp++;
+        // printf("%d", *((int *)(*tmp)));
+
+        fprintf(traceFp, "\n");
+
+    } else if (cbid == API_CUDA_cuMemcpyHtoD
+               || cbid == API_CUDA_cu64MemcpyHtoD
+               || cbid == API_CUDA_cuMemcpyHtoDAsync
+               || cbid == API_CUDA_cu64MemcpyHtoDAsync
+               || cbid == API_CUDA_cuMemcpyHtoD_v2
+               || cbid == API_CUDA_cuMemcpyHtoDAsync_v2
+               || cbid == API_CUDA_cuMemcpyHtoD_v2_ptds
+               || cbid == API_CUDA_cuMemcpyHtoDAsync_v2_ptsz) {
+        cuMemcpyHtoD_v2_params *p = (cuMemcpyHtoD_v2_params *)params;
+        CUdeviceptr dptr = p->dstDevice;
+        char *name = dptr_map->find(dptr)->second;
+
+        // Dump the source data to a file named cuMemcpyH2D-COUNT-SIZE.data
+        char buf[200];
+        sprintf(buf, "cuMemcpyH2D-%lu-%zu.data", tot_memcpy_h2d, p->ByteCount);
+        std::string filename(buf);
+        std::ofstream out(filename);
+        if (out.is_open()) {
+            out.write((const char *)p->srcHost, p->ByteCount);
+            out.close();
+        } else {
+            std::cerr << "Cannot open " << filename << std::endl;
+        }
+
+        tot_memcpy_h2d++;
+
+        fprintf(traceFp,
+                "CUDA memcpyH2D detected: device_ptr: %s, host_ptr: %p, "
+                "size: %zu, data_file: %s\n",
+                name, p->srcHost, p->ByteCount, buf);
+    } else if (cbid == API_CUDA_cuMemFree || cbid == API_CUDA_cuMemFree_v2) {
+        cuMemFree_v2_params *p = (cuMemFree_v2_params *)params;
+        CUdeviceptr dptr = p->dptr;
+        char *name = dptr_map->find(dptr)->second;
+        fprintf(traceFp, "CUDA free detected: dptr: %s\n", name);
+    }
+}
+
+void nvbit_at_term() {
+    printf("Finished CUDA API call tracing\n");
+    fclose(traceFp);
+}
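For reference, a minimal CUDA runtime-API program such as the hypothetical sketch below (not part of this patch) exercises every driver entry point the tool hooks: cudaMalloc lowers to cuMemAlloc_v2, cudaMemcpy to cuMemcpyHtoD_v2/cuMemcpyDtoH_v2, the <<<>>> launch to cuLaunchKernel, and cudaFree to cuMemFree_v2, so running it under the tool (e.g. LD_PRELOAD=./cuda_api_tracer.so ./vecadd) should produce cuda_calls.trace plus the per-copy .data dumps. Its kernel signature (double*, double*, double*, int) only uses argument types the signature parser above recognizes.

// vecadd.cu -- hypothetical test program, not part of this patch
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

__global__ void vecadd(double *a, double *b, double *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

int main() {
    const int n = 1024;
    const size_t bytes = n * sizeof(double);
    double *ha = (double *)malloc(bytes);
    double *hb = (double *)malloc(bytes);
    double *hc = (double *)malloc(bytes);
    for (int i = 0; i < n; i++) { ha[i] = i; hb[i] = 2.0 * i; }

    double *da, *db, *dc;
    cudaMalloc(&da, bytes);  // -> cuMemAlloc_v2, recorded under a dptr-N name
    cudaMalloc(&db, bytes);  // -> cuMemAlloc_v2
    cudaMalloc(&dc, bytes);  // -> cuMemAlloc_v2
    cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice);  // -> cuMemcpyHtoD_v2 + data dump
    cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice);
    vecadd<<<(n + 255) / 256, 256>>>(da, db, dc, n);    // -> cuLaunchKernel, args parsed from the signature
    cudaMemcpy(hc, dc, bytes, cudaMemcpyDeviceToHost);  // -> cuMemcpyDtoH_v2 + data dump
    cudaFree(da); cudaFree(db); cudaFree(dc);           // -> cuMemFree_v2
    printf("hc[1] = %f\n", hc[1]);
    free(ha); free(hb); free(hc);
    return 0;
}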