Skip to content

Commit 18e5cac

Browse files
committed
Added V100 mcm configs
1 parent 2260456 commit 18e5cac

File tree

237 files changed

+27162
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

237 files changed

+27162
-0
lines changed
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
//128*1 fly (k = 128, n = 1) with 32 flits per packet under gpgpusim injection mode
2+
use_map = 0;
3+
flit_size = 40;
4+
5+
// currently we do not use this, see subnets below
6+
network_count = 2;
7+
8+
// Topology
9+
topology = fly;
10+
k = 128;
11+
n = 1;
12+
13+
// Routing
14+
15+
routing_function = dest_tag;
16+
17+
18+
// Flow control
19+
20+
num_vcs = 1;
21+
vc_buf_size = 128;
22+
input_buffer_size = 256;
23+
ejection_buffer_size = 128;
24+
boundary_buffer_size = 128;
25+
26+
wait_for_tail_credit = 0;
27+
28+
// Router architecture
29+
30+
vc_allocator = islip; //separable_input_first;
31+
sw_allocator = islip; //separable_input_first;
32+
alloc_iters = 1;
33+
34+
credit_delay = 0;
35+
routing_delay = 0;
36+
vc_alloc_delay = 1;
37+
sw_alloc_delay = 1;
38+
39+
input_speedup = 2;
40+
output_speedup = 1;
41+
internal_speedup = 2.0;
42+
43+
// Traffic, GPGPU-Sim does not use this
44+
45+
traffic = uniform;
46+
packet_size ={{1,2,3,4},{10,20}};
47+
packet_size_rate={{1,1,1,1},{2,1}};
48+
49+
// Simulation - Don't change
50+
51+
sim_type = gpgpusim;
52+
//sim_type = latency;
53+
injection_rate = 0.1;
54+
55+
subnets = 2;
56+
57+
// Read and write are always used, regardless of the following line
58+
//use_read_write = 1;
59+
60+
61+
read_request_subnet = 0;
62+
read_reply_subnet = 1;
63+
write_request_subnet = 0;
64+
write_reply_subnet = 1;
65+
66+
read_request_begin_vc = 0;
67+
read_request_end_vc = 0;
68+
write_request_begin_vc = 0;
69+
write_request_end_vc = 0;
70+
read_reply_begin_vc = 0;
71+
read_reply_end_vc = 0;
72+
write_reply_begin_vc = 0;
73+
write_reply_end_vc = 0;
74+
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
# functional simulator specification
2+
-gpgpu_ptx_instruction_classification 0
3+
-gpgpu_ptx_sim_mode 0
4+
-gpgpu_ptx_force_max_capability 60
5+
6+
# SASS execution (only supported with CUDA >= 4.0)
7+
-gpgpu_ptx_convert_to_ptxplus 0
8+
-gpgpu_ptx_save_converted_ptxplus 0
9+
10+
# high level architecture configuration
11+
-gpgpu_n_clusters 64
12+
-gpgpu_n_cores_per_cluster 2
13+
-gpgpu_n_mem 64
14+
-gpgpu_n_sub_partition_per_mchannel 1
15+
16+
# Pascal clock domains
17+
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
18+
# Pascal NVIDIA GP100 clock domains are adopted from
19+
# https://en.wikipedia.org/wiki/Nvidia_Tesla
20+
-gpgpu_clock_domains 1480.0:1480.0:1480.0:715.0
21+
22+
# shader core pipeline config
23+
-gpgpu_shader_registers 65536
24+
25+
# This implies a maximum of 64 warps/SM
26+
-gpgpu_shader_core_pipeline 2048:32
27+
-gpgpu_shader_cta 32
28+
-gpgpu_simd_model 1
29+
30+
# Pipeline widths and number of FUs
31+
# ID_OC_SP,ID_OC_DP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_SFU,OC_EX_MEM,EX_WB
32+
## Pascal GP100 has 2 SP SIMD units, 2 SFU units, 2 DP units per core
33+
## we need to scale the number of pipeline registers to be equal to the number of SP units
34+
-gpgpu_pipeline_widths 4,4,4,1,4,4,4,1,12
35+
-gpgpu_num_sp_units 4
36+
-gpgpu_num_sfu_units 4
37+
-gpgpu_num_dp_units 4
38+
39+
# Instruction latencies and initiation intervals
40+
# "ADD,MAX,MUL,MAD,DIV"
41+
# All Div operations are executed on SFU unit
42+
# Throughput (initiation latency) values are adopted from CUDA SDK document V8, section 5.4.1, Table 2
43+
-ptx_opcode_latency_int 4,13,4,5,145
44+
-ptx_opcode_initiation_int 1,1,1,1,4
45+
-ptx_opcode_latency_fp 4,13,4,5,39
46+
-ptx_opcode_initiation_fp 1,2,1,1,4
47+
-ptx_opcode_latency_dp 8,19,8,8,330
48+
-ptx_opcode_initiation_dp 2,2,2,2,130
49+
-ptx_opcode_latency_sfu 8
50+
-ptx_opcode_initiation_sfu 4
51+
52+
53+
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
54+
# ** Optional parameter - Required when mshr_type==Texture Fifo
55+
# Note: Hashing set index function (H) only applies to a set size of 32 or 64.
56+
# Pascal GP100 has 64KB Shared memory
57+
-gpgpu_cache:dl1 S:64:128:8,L:L:f:N:H,A:256:8,16:0,32
58+
-gpgpu_cache:dl1PrefL1 S:64:128:16,L:L:f:N:H,A:256:8,16:0,32
59+
-gpgpu_cache:dl1PrefShared S:32:128:6,L:L:f:N:H,A:256:8,16:0,32
60+
-gpgpu_shmem_size 65536
61+
-gpgpu_shmem_size_PrefL1 1
62+
-gpgpu_shmem_size_PrefShared 98304
63+
-gmem_skip_L1D 1
64+
-icnt_flit_size 40
65+
-gpgpu_n_cluster_ejection_buffer_size 32
66+
67+
# 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). With 64 memory sub partitions this gives 8MB L2 cache
68+
-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:L,A:256:4,32:0,32
69+
-gpgpu_cache:dl2_texture_only 0
70+
-gpgpu_dram_partition_queues 64:64:64:64
71+
-perf_sim_memcpy 0
72+
-memory_partition_indexing 0
73+
74+
# 4 KB Inst.
75+
-gpgpu_cache:il1 N:8:128:4,L:R:f:N:L,S:2:48,4
76+
# 48 KB Tex
77+
-gpgpu_tex_cache:l1 N:16:128:24,L:R:m:N:L,F:128:4,128:2
78+
# 16 KB Const (128 sets x 64 bytes x 2-way)
79+
-gpgpu_const_cache:l1 N:128:64:2,L:R:f:N:L,S:2:64,4
80+
81+
# enable operand collector
82+
-gpgpu_operand_collector_num_units_sp 12
83+
-gpgpu_operand_collector_num_units_sfu 6
84+
-gpgpu_operand_collector_num_units_mem 8
85+
-gpgpu_operand_collector_num_units_dp 6
86+
-gpgpu_operand_collector_num_in_ports_sp 4
87+
-gpgpu_operand_collector_num_out_ports_sp 4
88+
-gpgpu_operand_collector_num_in_ports_sfu 1
89+
-gpgpu_operand_collector_num_out_ports_sfu 1
90+
-gpgpu_operand_collector_num_in_ports_mem 1
91+
-gpgpu_operand_collector_num_out_ports_mem 1
92+
-gpgpu_operand_collector_num_in_ports_dp 1
93+
-gpgpu_operand_collector_num_out_ports_dp 1
94+
# gpgpu_num_reg_banks should be increased to 32
95+
-gpgpu_num_reg_banks 32
96+
97+
# shared memory bank conflict detection
98+
-gpgpu_shmem_num_banks 32
99+
-gpgpu_shmem_limited_broadcast 0
100+
-gpgpu_shmem_warp_parts 1
101+
-gpgpu_coalesce_arch 60
102+
103+
## In Pascal, a warp scheduler can issue 2 insts per cycle using 2 diff execution units
104+
-gpgpu_max_insn_issue_per_warp 2
105+
-gpgpu_dual_issue_diff_exec_units 1
106+
107+
# interconnection
108+
-network_mode 1
109+
-inter_config_file config_fermi_islip.icnt
110+
111+
# memory partition latency config
112+
-rop_latency 120
113+
# DRAM latency should be lower compared to other configs, due to high-speed interposer connection
114+
-dram_latency 100
115+
116+
# dram model config
117+
-gpgpu_dram_scheduler 1
118+
# The DRAM return queue and the scheduler queue together should provide buffer
119+
# to sustain the memory level parallelism to tolerate DRAM latency
120+
# To allow 100% DRAM utility, there should at least be enough buffer to sustain
121+
# the minimum DRAM latency (100 core cycles). I.e.
122+
# Total buffer space required = 100 x 924MHz / 700MHz = 132
123+
-gpgpu_frfcfs_dram_sched_queue_size 64
124+
-gpgpu_dram_return_queue_size 192
125+
126+
# for HBM, 32 channels, each (128 bits) 16 bytes width
127+
-gpgpu_n_mem_per_ctrlr 1
128+
-gpgpu_dram_buswidth 16
129+
-gpgpu_dram_burst_length 2
130+
-dram_data_command_freq_ratio 2 # HBM is DDR
131+
-gpgpu_mem_address_mask 1
132+
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBBCCC.CCCSSSSS
133+
134+
# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
135+
# Timing for 1 GHZ
136+
# tRRDl and tWTR are missing, need to be added
137+
#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
138+
# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
139+
140+
# Timing for 715 MHZ, Tesla Pascal P100 HBM runs at 715 MHZ
141+
-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=10:RAS=24:RP=10:RC=34:
142+
CL=10:WL=2:CDLR=3:WR=9:nbkgrp=4:CCDL=2:RTPL=3"
143+
144+
# HBM has dual bus interface, in which it can issue two col and row commands at a time
145+
-dual_bus_interface 1
146+
# select lower bits for bnkgrp to increase bnkgrp parallelism
147+
-dram_bnk_indexing_policy 0
148+
-dram_bnkgrp_indexing_policy 1
149+
150+
#-Seperate_Write_Queue_Enable 1
151+
#-Write_Queue_Size 64:56:32
152+
153+
# Pascal has two schedulers per core; this configuration sets four
154+
-gpgpu_num_sched_per_core 4
155+
# Two Level Scheduler with active and pending pools
156+
#-gpgpu_scheduler two_level_active:6:0:1
157+
# Loose round robin scheduler
158+
#-gpgpu_scheduler lrr
159+
# Greedy then oldest scheduler
160+
-gpgpu_scheduler gto
161+
162+
# stat collection
163+
-gpgpu_memlatency_stat 14
164+
-gpgpu_runtime_stat 500
165+
-enable_ptx_file_line_stats 1
166+
-visualizer_enabled 0
167+
168+
# power model configs, disable it until we create a real energy model for Pascal 100
169+
-power_simulation_enabled 0
170+
-gpuwattch_xml_file gpuwattch_gtx480.xml
171+
172+
# tracing functionality
173+
#-trace_enabled 1
174+
#-trace_components WARP_SCHEDULER,SCOREBOARD
175+
#-trace_sampling_core 0
176+
177+
# Multichip config
178+
#if you change the partition mapping, ensure you change the any_net file as well
179+
#if FT policy is used, then it is important to use partition mapping = 0 (i.e. consecutive)
180+
-multi_chip_mode 0
181+
-n_gpu_chips 1
182+
-mcm_partition_mapping 1
183+
-mcm_vm_ft_policy 0
184+
-mcm_vm_pagesize 2048
185+
-mcm_coarse_grain_cta_sched 0
186+
-mcm_cta_sched_grain 1
187+
-cache_remote_data 0
188+
-m_n_external 0
189+
-cache_remote_only_once 0
190+
-gpgpu_cache:dr2 S:64:128:8,L:E:m:N:L,A:256:4,32:0,32
191+
-remote_cache_latency 0
192+
-offchiplet_latency 0
193+
-gpgpu_flush_l1_cache 1
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
router 0 node 128
2+
router 1 node 129
3+
router 2 node 130
4+
router 3 node 131
5+
router 4 node 132
6+
router 5 node 133
7+
router 6 node 134
8+
router 7 node 135
9+
router 8 node 136
10+
router 9 node 137
11+
router 10 node 138
12+
router 11 node 139
13+
router 12 node 140
14+
router 13 node 141
15+
router 14 node 142
16+
router 15 node 143
17+
router 16 node 144
18+
router 17 node 145
19+
router 18 node 146
20+
router 19 node 147
21+
router 20 node 148
22+
router 21 node 149
23+
router 22 node 150
24+
router 23 node 151
25+
router 24 node 152
26+
router 25 node 153
27+
router 26 node 154
28+
router 27 node 155
29+
router 28 node 156
30+
router 29 node 157
31+
router 30 node 158
32+
router 31 node 159
33+
router 32 node 160
34+
router 33 node 161
35+
router 34 node 162
36+
router 35 node 163
37+
router 36 node 164
38+
router 37 node 165
39+
router 38 node 166
40+
router 39 node 167
41+
router 40 node 168
42+
router 41 node 169
43+
router 42 node 170
44+
router 43 node 171
45+
router 44 node 172
46+
router 45 node 173
47+
router 46 node 174
48+
router 47 node 175
49+
router 48 node 176
50+
router 49 node 177
51+
router 50 node 178
52+
router 51 node 179
53+
router 52 node 180
54+
router 53 node 181
55+
router 54 node 182
56+
router 55 node 183
57+
router 56 node 184
58+
router 57 node 185
59+
router 58 node 186
60+
router 59 node 187
61+
router 60 node 188
62+
router 61 node 189
63+
router 62 node 190
64+
router 63 node 191
65+
router 64 node 0 node 1 node 2 node 3 node 4 node 5 node 6 node 7 node 8 node 9 node 10 node 11 node 12 node 13 node 14 node 15 node 16 node 17 node 18 node 19 node 20 node 21 node 22 node 23 node 24 node 25 node 26 node 27 node 28 node 29 node 30 node 31 router 0 router 4 router 8 router 12 router 16 router 20 router 24 router 28 router 32 router 36 router 40 router 44 router 48 router 52 router 56 router 60 node 192 node 196 node 200 node 204 node 208 node 212 node 216 node 220 node 224 node 228 node 232 node 236 node 240 node 244 node 248 node 252
66+
router 65 node 32 node 33 node 34 node 35 node 36 node 37 node 38 node 39 node 40 node 41 node 42 node 43 node 44 node 45 node 46 node 47 node 48 node 49 node 50 node 51 node 52 node 53 node 54 node 55 node 56 node 57 node 58 node 59 node 60 node 61 node 62 node 63 router 1 router 5 router 9 router 13 router 17 router 21 router 25 router 29 router 33 router 37 router 41 router 45 router 49 router 53 router 57 router 61 node 193 node 197 node 201 node 205 node 209 node 213 node 217 node 221 node 225 node 229 node 233 node 237 node 241 node 245 node 249 node 253
67+
router 66 node 64 node 65 node 66 node 67 node 68 node 69 node 70 node 71 node 72 node 73 node 74 node 75 node 76 node 77 node 78 node 79 node 80 node 81 node 82 node 83 node 84 node 85 node 86 node 87 node 88 node 89 node 90 node 91 node 92 node 93 node 94 node 95 router 2 router 6 router 10 router 14 router 18 router 22 router 26 router 30 router 34 router 38 router 42 router 46 router 50 router 54 router 58 router 62 node 194 node 198 node 202 node 206 node 210 node 214 node 218 node 222 node 226 node 230 node 234 node 238 node 242 node 246 node 250 node 254
68+
router 67 node 96 node 97 node 98 node 99 node 100 node 101 node 102 node 103 node 104 node 105 node 106 node 107 node 108 node 109 node 110 node 111 node 112 node 113 node 114 node 115 node 116 node 117 node 118 node 119 node 120 node 121 node 122 node 123 node 124 node 125 node 126 node 127 router 3 router 7 router 11 router 15 router 19 router 23 router 27 router 31 router 35 router 39 router 43 router 47 router 51 router 55 router 59 router 63 node 195 node 199 node 203 node 207 node 211 node 215 node 219 node 223 node 227 node 231 node 235 node 239 node 243 node 247 node 251 node 255
69+
router 68 router 64 router 1 router 5 router 9 router 13
70+
router 69 router 64 router 17 router 21 router 25 router 29
71+
router 70 router 64 router 33 router 37 router 41 router 45
72+
router 71 router 64 router 49 router 53 router 57 router 61
73+
router 72 router 64 router 2 router 6 router 10 router 14
74+
router 73 router 64 router 18 router 22 router 26 router 30
75+
router 74 router 64 router 34 router 38 router 42 router 46
76+
router 75 router 64 router 50 router 54 router 58 router 62
77+
router 76 router 64 router 3 router 7 router 11 router 15
78+
router 77 router 64 router 19 router 23 router 27 router 31
79+
router 78 router 64 router 35 router 39 router 43 router 47
80+
router 79 router 64 router 51 router 55 router 59 router 63
81+
router 80 router 65 router 0 router 4 router 8 router 12
82+
router 81 router 65 router 16 router 20 router 24 router 28
83+
router 82 router 65 router 32 router 36 router 40 router 44
84+
router 83 router 65 router 48 router 52 router 56 router 60
85+
router 84 router 65 router 2 router 6 router 10 router 14
86+
router 85 router 65 router 18 router 22 router 26 router 30
87+
router 86 router 65 router 34 router 38 router 42 router 46
88+
router 87 router 65 router 50 router 54 router 58 router 62
89+
router 88 router 65 router 3 router 7 router 11 router 15
90+
router 89 router 65 router 19 router 23 router 27 router 31
91+
router 90 router 65 router 35 router 39 router 43 router 47
92+
router 91 router 65 router 51 router 55 router 59 router 63
93+
router 92 router 66 router 0 router 4 router 8 router 12
94+
router 93 router 66 router 16 router 20 router 24 router 28
95+
router 94 router 66 router 32 router 36 router 40 router 44
96+
router 95 router 66 router 48 router 52 router 56 router 60
97+
router 96 router 66 router 1 router 5 router 9 router 13
98+
router 97 router 66 router 17 router 21 router 25 router 29
99+
router 98 router 66 router 33 router 37 router 41 router 45
100+
router 99 router 66 router 49 router 53 router 57 router 61
101+
router 100 router 66 router 3 router 7 router 11 router 15
102+
router 101 router 66 router 19 router 23 router 27 router 31
103+
router 102 router 66 router 35 router 39 router 43 router 47
104+
router 103 router 66 router 51 router 55 router 59 router 63
105+
router 104 router 67 router 0 router 4 router 8 router 12
106+
router 105 router 67 router 16 router 20 router 24 router 28
107+
router 106 router 67 router 32 router 36 router 40 router 44
108+
router 107 router 67 router 48 router 52 router 56 router 60
109+
router 108 router 67 router 1 router 5 router 9 router 13
110+
router 109 router 67 router 17 router 21 router 25 router 29
111+
router 110 router 67 router 33 router 37 router 41 router 45
112+
router 111 router 67 router 49 router 53 router 57 router 61
113+
router 112 router 67 router 2 router 6 router 10 router 14
114+
router 113 router 67 router 18 router 22 router 26 router 30
115+
router 114 router 67 router 34 router 38 router 42 router 46
116+
router 115 router 67 router 50 router 54 router 58 router 62

0 commit comments

Comments
 (0)