# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 60

# SASS execution (only supported with CUDA >= 4.0)
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0

# high level architecture configuration
-gpgpu_n_clusters 64
-gpgpu_n_cores_per_cluster 2
-gpgpu_n_mem 64
-gpgpu_n_sub_partition_per_mchannel 1

# Pascal clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
# Pascal NVIDIA GP100 clock domains are adopted from
# https://en.wikipedia.org/wiki/Nvidia_Tesla
-gpgpu_clock_domains 1480.0:1480.0:1480.0:715.0
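# Derived note: the core-to-DRAM clock ratio is 1480/715 ~= 2.07, so one
# DRAM cycle spans roughly two core cycles.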

# shader core pipeline config
-gpgpu_shader_registers 65536

# This implies a maximum of 64 warps/SM
-gpgpu_shader_core_pipeline 2048:32
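# Derivation: 2048 threads/SM / 32 threads per warp = 64 warps/SM.
# At full occupancy, 65536 registers / 2048 threads = 32 registers per thread.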
-gpgpu_shader_cta 32
-gpgpu_simd_model 1

# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_DP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_SFU,OC_EX_MEM,EX_WB
## Pascal GP100 has 2 SP SIMD units, 2 SFU units, and 2 DP units per SM;
## this scaled config uses 4 of each.
## The pipeline register widths are scaled to match the number of SP units.
-gpgpu_pipeline_widths 4,4,4,1,4,4,4,1,12
-gpgpu_num_sp_units 4
-gpgpu_num_sfu_units 4
-gpgpu_num_dp_units 4
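# Decoded widths, matching the field order listed above:
#   ID_OC_SP=4, ID_OC_DP=4, ID_OC_SFU=4, ID_OC_MEM=1,
#   OC_EX_SP=4, OC_EX_DP=4, OC_EX_SFU=4, OC_EX_MEM=1, EX_WB=12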

# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
# All DIV operations are executed on the SFU unit
# Throughputs (initiation intervals) are adopted from the CUDA SDK documentation v8, section 5.4.1, Table 2
-ptx_opcode_latency_int 4,13,4,5,145
-ptx_opcode_initiation_int 1,1,1,1,4
-ptx_opcode_latency_fp 4,13,4,5,39
-ptx_opcode_initiation_fp 1,2,1,1,4
-ptx_opcode_latency_dp 8,19,8,8,330
-ptx_opcode_initiation_dp 2,2,2,2,130
-ptx_opcode_latency_sfu 8
-ptx_opcode_initiation_sfu 4
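# Example reading: in the DP rows above, a double-precision DIV has a
# 330-cycle latency and a 130-cycle initiation interval, i.e. a new DP
# divide can only start every 130 cycles on the same unit.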


# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# Note: The hashed set index function (H) only applies when the number of sets is 32 or 64.
# Pascal GP100 has 64KB Shared memory
-gpgpu_cache:dl1 S:64:128:8,L:L:f:N:H,A:256:8,16:0,32
-gpgpu_cache:dl1PrefL1 S:64:128:16,L:L:f:N:H,A:256:8,16:0,32
-gpgpu_cache:dl1PrefShared S:32:128:6,L:L:f:N:H,A:256:8,16:0,32
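# Geometry (derived; the leading S/N letter selects a sectored vs. normal cache):
#   dl1           64 sets x 128 B x  8 ways = 64 KB
#   dl1PrefL1     64 sets x 128 B x 16 ways = 128 KB
#   dl1PrefShared 32 sets x 128 B x  6 ways = 24 KB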
-gpgpu_shmem_size 65536
-gpgpu_shmem_size_PrefL1 1
-gpgpu_shmem_size_PrefShared 98304
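# 65536 B = 64 KB shared memory by default; 98304 B = 96 KB when the
# kernel prefers shared memory.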
-gmem_skip_L1D 1
-icnt_flit_size 40
-gpgpu_n_cluster_ejection_buffer_size 32

# 64 sets, each 128 bytes, 16-way for each memory sub partition (128 KB per memory sub partition). With 64 sub partitions this gives 8MB of L2 cache in total.
-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:L,A:256:4,32:0,32
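# Check: 64 sets x 128 B x 16 ways = 128 KB per sub partition;
# 64 channels x 1 sub partition x 128 KB = 8 MB total.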
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64
-perf_sim_memcpy 0
-memory_partition_indexing 0

# 4 KB Inst.
-gpgpu_cache:il1 N:8:128:4,L:R:f:N:L,S:2:48,4
# 48 KB Tex
-gpgpu_tex_cache:l1 N:16:128:24,L:R:m:N:L,F:128:4,128:2
# 16 KB Const
-gpgpu_const_cache:l1 N:128:64:2,L:R:f:N:L,S:2:64,4
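# Size check: il1 = 8 x 128 B x 4 = 4 KB; tex l1 = 16 x 128 B x 24 = 48 KB;
# const l1 = 128 x 64 B x 2 = 16 KB.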

# enable operand collector
-gpgpu_operand_collector_num_units_sp 12
-gpgpu_operand_collector_num_units_sfu 6
-gpgpu_operand_collector_num_units_mem 8
-gpgpu_operand_collector_num_units_dp 6
-gpgpu_operand_collector_num_in_ports_sp 4
-gpgpu_operand_collector_num_out_ports_sp 4
-gpgpu_operand_collector_num_in_ports_sfu 1
-gpgpu_operand_collector_num_out_ports_sfu 1
-gpgpu_operand_collector_num_in_ports_mem 1
-gpgpu_operand_collector_num_out_ports_mem 1
-gpgpu_operand_collector_num_in_ports_dp 1
-gpgpu_operand_collector_num_out_ports_dp 1
# the number of register banks is increased to 32
-gpgpu_num_reg_banks 32
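# With 65536 registers per SM and 32 banks, each bank holds
# 65536 / 32 = 2048 registers.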

# shared memory bank conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 60

## In Pascal, a warp scheduler can issue 2 instructions per cycle to 2 different execution units
-gpgpu_max_insn_issue_per_warp 2
-gpgpu_dual_issue_diff_exec_units 1

# interconnection
-network_mode 1
-inter_config_file config_fermi_islip.icnt

# memory partition latency config
-rop_latency 120
# DRAM latency is lower here than in other configs because of the high-speed interposer connection
-dram_latency 100

# dram model config
-gpgpu_dram_scheduler 1
# The DRAM return queue and the scheduler queue together should provide enough
# buffering to sustain the memory-level parallelism needed to tolerate DRAM latency.
# To allow 100% DRAM utilization, there should at least be enough buffer entries
# to cover the minimum DRAM latency (100 core cycles). For this config's clocks:
# Total buffer space required = 100 x 1480MHz / 715MHz ~= 207
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192
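# Check: scheduler queue (64) + return queue (192) = 256 entries >= ~207 required.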

# HBM: 64 channels, each 128 bits (16 bytes) wide
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 16
-gpgpu_dram_burst_length 2
-dram_data_command_freq_ratio 2 # HBM is DDR
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBBCCC.CCCSSSSS
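# Decoded mapping (a sketch of the usual GPGPU-Sim convention): S = byte
# offset within one access (5 bits = 32 B = 16 B bus width x burst length 2),
# C = column, B = bank, R = row; dramid@8 inserts the channel id at bit 8,
# interleaving channels at 256 B granularity.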

# HBM timings are adopted from the JEDEC JESD235 HBM standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
# Timing for 1 GHz
# tRRDl and tWTR are missing and need to be added
#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"

# Timing for 715 MHz; the Tesla Pascal P100 HBM runs at 715 MHz
-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=10:RAS=24:RP=10:RC=34:
 CL=10:WL=2:CDLR=3:WR=9:nbkgrp=4:CCDL=2:RTPL=3"
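# Scaling check (derived): the 715 MHz values are the 1 GHz values scaled
# by 715/1000 and rounded, e.g. RCD 14 -> 10, RAS 33 -> 24, RC 47 -> 34,
# CL 14 -> 10, WR 12 -> 9, RRD 4 -> 3, RTPL 4 -> 3.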

# HBM has a dual command bus interface, so it can issue a column command and a row command in the same cycle
-dual_bus_interface 1
# select lower bits for bnkgrp to increase bank-group parallelism
-dram_bnk_indexing_policy 0
-dram_bnkgrp_indexing_policy 1

#-Seperate_Write_Queue_Enable 1
#-Write_Queue_Size 64:56:32

# Pascal GP100 has two warp schedulers per SM; this scaled config uses four
-gpgpu_num_sched_per_core 4
# Two-level scheduler with active and pending pools
#-gpgpu_scheduler two_level_active:6:0:1
# Loose round-robin scheduler
#-gpgpu_scheduler lrr
# Greedy-then-oldest scheduler
-gpgpu_scheduler gto

# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0

# power model configs; disabled until we create a real energy model for Pascal GP100
-power_simulation_enabled 0
-gpuwattch_xml_file gpuwattch_gtx480.xml

# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0

# Multi-chip config
# If you change the partition mapping, ensure you change the any_net file as well.
# If the FT policy is used, it is important to use partition_mapping = 0 (i.e. consecutive).
-multi_chip_mode 0
-n_gpu_chips 1
-mcm_partition_mapping 1
-mcm_vm_ft_policy 0
-mcm_vm_pagesize 2048
-mcm_coarse_grain_cta_sched 0
-mcm_cta_sched_grain 1
-cache_remote_data 0
-m_n_external 0
-cache_remote_only_once 0
-gpgpu_cache:dr2 S:64:128:8,L:E:m:N:L,A:256:4,32:0,32
-remote_cache_latency 0
-offchiplet_latency 0
-gpgpu_flush_l1_cache 1