@@ -2282,32 +2282,40 @@ bool ldst_unit::memory_cycle(warp_inst_t &inst,
     // bypass L1 cache
     unsigned control_size =
         inst.is_store() ? WRITE_PACKET_SIZE : READ_PACKET_SIZE;
-    unsigned size = access.get_size() + control_size;
-    // printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size);
-    if (m_memory_config->SST_mode &&
-        (static_cast<sst_memory_interface *>(m_icnt)->full(
-            size, inst.is_store() || inst.isatomic(), access.get_type()))) {
-      // SST need mf type here
-      // Cast it to sst_memory_interface pointer first as this full() method
-      // is not a virtual method in parent class
-      stall_cond = ICNT_RC_FAIL;
-    } else if (!m_memory_config->SST_mode &&
-               (m_icnt->full(size, inst.is_store() || inst.isatomic()))) {
-      stall_cond = ICNT_RC_FAIL;
-    } else {
-      mem_fetch *mf =
-          m_mf_allocator->alloc(inst, access,
-                                m_core->get_gpu()->gpu_sim_cycle +
-                                    m_core->get_gpu()->gpu_tot_sim_cycle);
-      m_icnt->push(mf);
-      inst.accessq_pop_back();
-      // inst.clear_active( access.get_warp_mask() );
-      if (inst.is_load()) {
-        for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
-          if (inst.out[r] > 0)
-            assert(m_pending_writes[inst.warp_id()][inst.out[r]] > 0);
-      } else if (inst.is_store())
-        m_core->inc_store_req(inst.warp_id());
+    for (unsigned i = 0; i < m_config->m_L1D_config.l1_banks; i++) {
+      if (inst.accessq_empty()) {
+        break;
+      }
+      const mem_access_t &access = inst.accessq_back();
+      unsigned size = access.get_size() + control_size;
+      // printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size);
+      if (m_memory_config->SST_mode &&
+          (static_cast<sst_memory_interface *>(m_icnt)->full(
+              size, inst.is_store() || inst.isatomic(), access.get_type()))) {
+        // SST need mf type here
+        // Cast it to sst_memory_interface pointer first as this full() method
+        // is not a virtual method in parent class
+        stall_cond = ICNT_RC_FAIL;
+        break;
+      } else if (!m_memory_config->SST_mode &&
+                 (m_icnt->full(size, inst.is_store() || inst.isatomic()))) {
+        stall_cond = ICNT_RC_FAIL;
+        break;
+      } else {
+        mem_fetch *mf =
+            m_mf_allocator->alloc(inst, access,
+                                  m_core->get_gpu()->gpu_sim_cycle +
+                                      m_core->get_gpu()->gpu_tot_sim_cycle);
+        m_icnt->push(mf);
+        inst.accessq_pop_back();
+        // inst.clear_active( access.get_warp_mask() );
+        if (inst.is_load()) {
+          for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
+            if (inst.out[r] > 0)
+              assert(m_pending_writes[inst.warp_id()][inst.out[r]] > 0);
+        } else if (inst.is_store())
+          m_core->inc_store_req(inst.warp_id());
+      }
     }
   } else {
     assert(CACHE_UNDEFINED != inst.cache_op);
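This hunk turns the single-shot L1-bypass path into a loop: up to `m_config->m_L1D_config.l1_banks` accesses are popped from the instruction's access queue per cycle, and the loop breaks out as soon as the queue empties or the interconnect (SST or native) reports it is full, which preserves the original `ICNT_RC_FAIL` stall semantics. A minimal sketch of that drain-up-to-N-per-cycle pattern is below; the `Access`, `icnt_full`, and `memory_cycle_bypass` names are made-up stand-ins, not the simulator's real classes.

```cpp
#include <cstdio>
#include <deque>

// Hypothetical stand-ins for the per-instruction access queue and the
// interconnect injection port (not gpgpu-sim's real types).
struct Access {
  unsigned addr;
  unsigned size;
};

// Pretend the injection port can hold only two in-flight packets.
static bool icnt_full(unsigned /*bytes*/, unsigned in_flight) {
  return in_flight >= 2;
}

// Drain at most `banks` accesses this cycle; stop early when the queue
// empties or the port is full, leaving the rest for the next cycle.
static unsigned memory_cycle_bypass(std::deque<Access> &accessq,
                                    unsigned banks, unsigned &in_flight) {
  unsigned pushed = 0;
  for (unsigned i = 0; i < banks; i++) {
    if (accessq.empty()) break;
    const Access &a = accessq.back();
    if (icnt_full(a.size, in_flight)) break;  // stall: retry next cycle
    in_flight++;                              // "push" the packet
    accessq.pop_back();
    pushed++;
  }
  return pushed;
}

int main() {
  std::deque<Access> q = {{0x100, 32}, {0x140, 32}, {0x180, 32}, {0x1c0, 32}};
  unsigned in_flight = 0;
  unsigned sent = memory_cycle_bypass(q, /*banks=*/4, in_flight);
  std::printf("sent %u accesses, %zu still queued\n", sent, q.size());
  return 0;
}
```

Accesses that do not make it out this cycle simply stay queued and are retried on the next call, so the early `break` changes throughput, not correctness.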
@@ -4534,41 +4542,55 @@ unsigned simt_core_cluster::get_n_active_sms() const {
 }
 
 unsigned simt_core_cluster::issue_block2core() {
+  const unsigned max_pending_ctas = 4;
+  for (unsigned core = 0; core < m_config->n_simt_cores_per_cluster; core++) {
+    if (m_core[core]->pending_ctas.size() < max_pending_ctas) {
+      kernel_info_t *kernel;
+      // Jin: fetch kernel according to concurrent kernel setting
+      if (m_config->gpgpu_concurrent_kernel_sm) {  // concurrent kernel on sm
+        // always select latest issued kernel
+        kernel_info_t *k = m_gpu->select_kernel();
+        kernel = k;
+      } else {
+        // first select core kernel, if no more cta, get a new kernel
+        // only when core completes
+        kernel = m_core[core]->get_kernel();
+        if (!m_gpu->kernel_more_cta_left(kernel)) {
+          // wait till current kernel finishes
+          if (m_core[core]->get_not_completed() == 0 &&
+              m_core[core]->pending_ctas.empty()) {
+            kernel_info_t *k = m_gpu->select_kernel();
+            if (k) m_core[core]->set_kernel(k);
+            kernel = k;
+          }
+        }
+      }
+      if (kernel) {
+        if (kernel->allocated_ctas < kernel->num_blocks()) {
+          m_core[core]->pending_ctas.push_back(kernel);
+          kernel->allocated_ctas++;
+        }
+      }
+    }
+  }
+
   unsigned num_blocks_issued = 0;
   for (unsigned i = 0; i < m_config->n_simt_cores_per_cluster; i++) {
     unsigned core =
         (i + m_cta_issue_next_core + 1) % m_config->n_simt_cores_per_cluster;
 
-    kernel_info_t *kernel;
-    // Jin: fetch kernel according to concurrent kernel setting
-    if (m_config->gpgpu_concurrent_kernel_sm) {  // concurrent kernel on sm
-      // always select latest issued kernel
-      kernel_info_t *k = m_gpu->select_kernel();
-      kernel = k;
-    } else {
-      // first select core kernel, if no more cta, get a new kernel
-      // only when core completes
-      kernel = m_core[core]->get_kernel();
-      if (!m_gpu->kernel_more_cta_left(kernel)) {
-        // wait till current kernel finishes
-        if (m_core[core]->get_not_completed() == 0) {
-          kernel_info_t *k = m_gpu->select_kernel();
-          if (k) m_core[core]->set_kernel(k);
-          kernel = k;
-        }
+    if (m_core[core]->pending_ctas.size() > 0) {
+      kernel_info_t *pending_cta = m_core[core]->pending_ctas.front();
+      if (m_core[core]->can_issue_1block(*pending_cta)) {
+        m_core[core]->issue_block2core(*pending_cta);
+        m_core[core]->pending_ctas.pop_front();
+        num_blocks_issued++;
+        m_cta_issue_next_core = core;
+        break;
       }
     }
-
-    if (m_gpu->kernel_more_cta_left(kernel) &&
-        //            (m_core[core]->get_n_active_cta() <
-        //            m_config->max_cta(*kernel)) ) {
-        m_core[core]->can_issue_1block(*kernel)) {
-      m_core[core]->issue_block2core(*kernel);
-      num_blocks_issued++;
-      m_cta_issue_next_core = core;
-      break;
-    }
   }
+
   return num_blocks_issued;
 }
 
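This hunk splits CTA issue into two phases. A fill phase walks every core and, whenever that core's `pending_ctas` backlog holds fewer than four entries, selects a kernel (the core's current one, or a freshly selected one only once the core has drained both its active work and its backlog) and reserves one of its blocks by bumping `allocated_ctas`. An issue phase then round-robins over the cores starting after `m_cta_issue_next_core` and launches the front of the first non-empty backlog whose core passes `can_issue_1block`. A small self-contained sketch of the same bounded fill-then-issue pattern follows, using hypothetical `Core` and `Kernel` types in place of the simulator's classes.

```cpp
#include <cstdio>
#include <deque>
#include <vector>

// Hypothetical stand-ins for kernel_info_t and the SIMT core (not the
// simulator's real classes).
struct Kernel {
  unsigned num_blocks = 0;
  unsigned allocated_ctas = 0;  // blocks already reserved by the fill phase
};

struct Core {
  std::deque<Kernel *> pending_ctas;  // bounded per-core CTA backlog
  unsigned active_ctas = 0;
  bool can_issue_1block() const { return active_ctas < 8; }  // made-up limit
};

// One cycle: fill each core's backlog up to the cap, then issue at most one
// CTA round-robin across the cores.
static unsigned issue_block2core(std::vector<Core> &cores, Kernel &kernel,
                                 unsigned &next_core) {
  const unsigned max_pending_ctas = 4;
  for (Core &c : cores)  // fill phase: reserve blocks for free backlog slots
    if (c.pending_ctas.size() < max_pending_ctas &&
        kernel.allocated_ctas < kernel.num_blocks) {
      c.pending_ctas.push_back(&kernel);
      kernel.allocated_ctas++;
    }

  for (unsigned i = 0; i < cores.size(); i++) {  // issue phase: round-robin
    unsigned core = (i + next_core + 1) % cores.size();
    Core &c = cores[core];
    if (!c.pending_ctas.empty() && c.can_issue_1block()) {
      c.pending_ctas.pop_front();
      c.active_ctas++;
      next_core = core;
      return 1;  // one block issued this cycle
    }
  }
  return 0;
}

int main() {
  std::vector<Core> cores(2);
  Kernel k;
  k.num_blocks = 16;
  unsigned next_core = 0, issued = 0;
  for (unsigned cycle = 0; cycle < 4; cycle++)
    issued += issue_block2core(cores, k, next_core);
  std::printf("issued %u CTAs, reserved %u\n", issued, k.allocated_ctas);
  return 0;
}
```

The `allocated_ctas` counter caps total reservations at the kernel's block count, so the same block is never promised to two cores, and the backlog limit of four presumably keeps any single core from hoarding blocks while it is unable to issue.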