cuCollections/benchmarks/synchronization.hpp at edb0bfda0322b3595e56db9db05427401bed732e · PointKernel/cuCollections · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/*
 * Copyright (c) 2020-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

// Google Benchmark library
#include <benchmark/benchmark.h>

#include <cuda_runtime_api.h>

#define BENCH_CUDA_TRY(call)                                                         \
  do {                                                                               \
    auto const status = (call);                                                      \
    if (cudaSuccess != status) { throw std::runtime_error("CUDA error detected."); } \
  } while (0)

#define BENCH_ASSERT_CUDA_SUCCESS(expr) \
  do {                                  \
    cudaError_t const status = (expr);  \
    assert(cudaSuccess == status);      \
  } while (0)
/**
 * @brief  This class serves as a wrapper for using `cudaEvent_t` as the user
 * defined timer within the framework of google benchmark
 * (https://github.com/google/benchmark).
 *
 * It is built on top of the idea of Resource acquisition is initialization
 * (RAII). In the following we show a minimal example of how to use this class.
 *
 * \code{cpp}
 * #include <benchmark/benchmark.h>
 *
 * static void sample_cuda_benchmark(benchmark::State& state) {
 *
 *   for (auto _ : state){
 *     cudaStream_t stream = 0;
 *
 *     // Create (Construct) an object of this class. You HAVE to pass in the
 *     // benchmark::State object you are using. It measures the time from its
 *     // creation to its destruction that is spent on the specified CUDA stream.
 *     // It also clears the L2 cache by cudaMemset'ing a device buffer that is of
 *     // the size of the L2 cache (if flush_l2_cache is set to true and there is
 *     // an L2 cache on the current device).
 *     cuda_event_timer raii(state, true, stream); // flush_l2_cache = true
 *
 *     // Now perform the operations that is to be benchmarked
 *     sample_kernel<<<1, 256, 0, stream>>>(); // Possibly launching a CUDA kernel
 *
 *   }
 * }
 *
 * // Register the function as a benchmark. You will need to set the `UseManualTime()`
 * // flag in order to use the timer embeded in this class.
 * BENCHMARK(sample_cuda_benchmark)->UseManualTime();
 * \endcode
 *
 *
 */
class cuda_event_timer {
 public:
  /**
   * @brief Constructs a `cuda_event_timer` beginning a manual timing range.
   *
   * Optionally flushes L2 cache.
   *
   * @param[in,out] state  This is the benchmark::State whose timer we are going
   * to update.
   * @param[in] flush_l2_cache_ whether or not to flush the L2 cache before
   *                            every iteration.
   * @param[in] stream_ The CUDA stream we are measuring time on.
   */
  cuda_event_timer(benchmark::State& state, bool flush_l2_cache = false, cudaStream_t stream = 0)
    : p_state(&state), stream_(stream)
  {
    // flush all of L2$
    if (flush_l2_cache) {
      int current_device = 0;
      BENCH_CUDA_TRY(cudaGetDevice(&current_device));

      int l2_cache_bytes = 0;
      BENCH_CUDA_TRY(
        cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));

      if (l2_cache_bytes > 0) {
        const int memset_value = 0;
        int* l2_cache_buffer   = nullptr;
        BENCH_CUDA_TRY(cudaMalloc(&l2_cache_buffer, l2_cache_bytes));
        BENCH_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream_));
        BENCH_CUDA_TRY(cudaFree(l2_cache_buffer));
      }
    }

    BENCH_CUDA_TRY(cudaEventCreate(&start_));
    BENCH_CUDA_TRY(cudaEventCreate(&stop_));
    BENCH_CUDA_TRY(cudaEventRecord(start_, stream_));
  }

  cuda_event_timer() = delete;

  /**
   * @brief Destroy the `cuda_event_timer` and ending the manual time range.
   *
   */
  ~cuda_event_timer()
  {
    BENCH_ASSERT_CUDA_SUCCESS(cudaEventRecord(stop_, stream_));
    BENCH_ASSERT_CUDA_SUCCESS(cudaEventSynchronize(stop_));
    float milliseconds = 0.0f;
    BENCH_ASSERT_CUDA_SUCCESS(cudaEventElapsedTime(&milliseconds, start_, stop_));
    p_state->SetIterationTime(milliseconds / (1000.0f));
    BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(start_));
    BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(stop_));
  }

 private:
  cudaEvent_t start_;
  cudaEvent_t stop_;
  cudaStream_t stream_;
  benchmark::State* p_state;
};